update
This commit is contained in:
1
AzA march 2026/services/__init__.py
Normal file
1
AzA march 2026/services/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Services package for live search pipeline.
|
||||
220
AzA march 2026/services/event_extract_llm.py
Normal file
220
AzA march 2026/services/event_extract_llm.py
Normal file
@@ -0,0 +1,220 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
from services.live_event_search import SearchResult
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EventCandidate:
    """One LLM-extracted congress/training event candidate.

    Field names are camelCase on purpose: they mirror the JSON keys the
    LLM is instructed to emit, so rows can be mapped without renaming.
    """

    name: str
    startDate: str | None  # ISO YYYY-MM-DD, or None when the LLM was unsure
    endDate: str | None    # ISO YYYY-MM-DD; falls back to startDate upstream
    city: str
    country: str
    urlCandidate: str      # URL the LLM picked from the supplied data
    shortDescription: str  # truncated to 600 chars during normalization
    organizer: str
    specialtyTags: list[str]  # lowercased specialty slugs
    regionTags: list[str]     # uppercased region codes (e.g. "CH", "EU")
    rationale: str            # LLM's justification, truncated to 300 chars
    confidence: float         # clamped to [0.0, 1.0]
|
||||
|
||||
|
||||
def _clean_text(t: str) -> str:
|
||||
t = re.sub(r"<[^>]+>", " ", t or "")
|
||||
t = re.sub(r"\s+", " ", t).strip()
|
||||
return t
|
||||
|
||||
|
||||
def _safe_date(value: Any) -> str | None:
|
||||
s = str(value or "").strip()
|
||||
if not s:
|
||||
return None
|
||||
if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s):
|
||||
return s
|
||||
return None
|
||||
|
||||
|
||||
def _extract_page_excerpt(url: str, max_chars: int = 14000) -> str:
    """Fetch *url* and return a cleaned plain-text excerpt (best effort).

    Reads at most ``max_chars * 2`` bytes so that stripping HTML markup
    still leaves roughly ``max_chars`` characters of text. Returns ""
    for empty URLs and on any network/decoding failure — callers treat
    the excerpt as optional LLM context only.
    """
    u = (url or "").strip()
    if not u:
        return ""
    try:
        req = urllib.request.Request(
            u,
            headers={"User-Agent": "AZA-LiveEventSearch/1.0"},
            method="GET",
        )
        # Short timeout: a slow event site must not stall the whole pipeline.
        with urllib.request.urlopen(req, timeout=8) as resp:
            raw = resp.read(max_chars * 2).decode("utf-8", errors="ignore")
        return _clean_text(raw)[:max_chars]
    except Exception:
        # Best effort by design: a missing excerpt only means less context.
        return ""
|
||||
|
||||
|
||||
def _make_prompt(
|
||||
search_rows: list[SearchResult],
|
||||
specialty: str,
|
||||
regions: list[str],
|
||||
excerpts: list[dict[str, str]],
|
||||
) -> str:
|
||||
search_payload = [
|
||||
{"title": r.title, "snippet": r.snippet, "url": r.url}
|
||||
for r in search_rows
|
||||
]
|
||||
rules = (
|
||||
"Du extrahierst medizinische Kongresse/Weiterbildungen als STRICT JSON.\n"
|
||||
"NUR aus den gelieferten Daten, NICHT raten, NICHT erfinden.\n"
|
||||
"Wenn Datum/Ort unklar: null setzen und confidence reduzieren.\n"
|
||||
"urlCandidate MUSS eine gelieferte URL oder eine URL aus pageExcerpts sein.\n"
|
||||
"Antwortformat: JSON-Objekt mit key 'events' (Array).\n"
|
||||
"Jedes Event: {name,startDate,endDate,city,country,urlCandidate,shortDescription,organizer,specialtyTags,regionTags,rationale,confidence}\n"
|
||||
"startDate/endDate als ISO YYYY-MM-DD oder null.\n"
|
||||
"confidence zwischen 0 und 1."
|
||||
)
|
||||
payload = {
|
||||
"specialty": specialty,
|
||||
"regions": regions,
|
||||
"searchResults": search_payload,
|
||||
"pageExcerpts": excerpts,
|
||||
}
|
||||
return rules + "\n\nDATA:\n" + json.dumps(payload, ensure_ascii=False)
|
||||
|
||||
|
||||
def _extract_json_block(text: str) -> dict:
|
||||
try:
|
||||
data = json.loads(text)
|
||||
if isinstance(data, dict):
|
||||
return data
|
||||
except Exception:
|
||||
pass
|
||||
match = re.search(r"\{.*\}", text, flags=re.DOTALL)
|
||||
if not match:
|
||||
return {"events": []}
|
||||
try:
|
||||
data = json.loads(match.group(0))
|
||||
if isinstance(data, dict):
|
||||
return data
|
||||
except Exception:
|
||||
pass
|
||||
return {"events": []}
|
||||
|
||||
|
||||
def _call_openai_json(prompt: str) -> dict:
    """Send *prompt* to OpenAI chat completions and parse the JSON reply.

    Raises:
        RuntimeError: when OPENAI_API_KEY is not set.

    Returns the parsed dict, or ``{"events": []}`` when the reply cannot
    be parsed (via _extract_json_block).
    """
    key = os.getenv("OPENAI_API_KEY", "").strip()
    if not key:
        raise RuntimeError("OPENAI_API_KEY nicht gesetzt")
    # Model is configurable per deployment; defaults to a small, cheap one.
    model = os.getenv("EVENT_LLM_MODEL", "gpt-4o-mini").strip()
    client = OpenAI(api_key=key)
    resp = client.chat.completions.create(
        model=model,
        temperature=0,  # deterministic extraction, no creative variation
        messages=[
            {"role": "system", "content": "Gib nur valides JSON zurück."},
            {"role": "user", "content": prompt},
        ],
    )
    txt = ""
    try:
        txt = (resp.choices[0].message.content or "").strip()
    except Exception:
        # Defensive: an empty/malformed completion object yields "".
        txt = ""
    return _extract_json_block(txt)
|
||||
|
||||
|
||||
def _call_gemini_json(prompt: str) -> dict:
    """Send *prompt* to the Gemini ``generateContent`` REST API, parse JSON.

    Raises:
        RuntimeError: when GEMINI_API_KEY is not set.

    Network errors propagate to the caller. Returns ``{"events": []}``
    when the reply text cannot be parsed (via _extract_json_block).
    """
    key = os.getenv("GEMINI_API_KEY", "").strip()
    if not key:
        raise RuntimeError("GEMINI_API_KEY nicht gesetzt")
    model = os.getenv("GEMINI_MODEL", "gemini-1.5-flash").strip()
    # Model and key are URL components, so both must be percent-encoded.
    url = (
        f"https://generativelanguage.googleapis.com/v1beta/models/"
        f"{urllib.parse.quote(model)}:generateContent?key={urllib.parse.quote(key)}"
    )
    body = {
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {"temperature": 0},  # deterministic extraction
    }
    req = urllib.request.Request(
        url,
        headers={"Content-Type": "application/json"},
        data=json.dumps(body).encode("utf-8"),
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=20) as resp:
        raw = resp.read().decode("utf-8", errors="ignore")
    data = json.loads(raw)
    txt = ""
    try:
        # Gemini nests the reply deep inside the candidates payload.
        txt = data["candidates"][0]["content"]["parts"][0]["text"]
    except Exception:
        txt = ""
    return _extract_json_block(txt)
|
||||
|
||||
|
||||
def _normalize_candidate(row: dict, default_specialty: str, default_regions: list[str]) -> EventCandidate | None:
    """Validate and coerce one raw LLM row into an EventCandidate.

    Returns None for rows that are not dicts or lack a name/URL.
    Confidence is clamped to [0, 1]; tag lists fall back to the defaults.
    """
    if not isinstance(row, dict):
        return None
    title = str(row.get("name") or "").strip()
    link = str(row.get("urlCandidate") or "").strip()
    if not (title and link):
        return None
    try:
        score = float(row.get("confidence", 0.0))
    except Exception:
        score = 0.0
    score = min(1.0, max(0.0, score))
    raw_spec = row.get("specialtyTags")
    raw_regions = row.get("regionTags")
    spec_tags = raw_spec if isinstance(raw_spec, list) else [default_specialty]
    reg_tags = raw_regions if isinstance(raw_regions, list) else list(default_regions)
    start = _safe_date(row.get("startDate"))
    return EventCandidate(
        name=title,
        startDate=start,
        # A one-day event often only reports a start date.
        endDate=_safe_date(row.get("endDate")) or start,
        city=str(row.get("city") or "").strip(),
        country=str(row.get("country") or "").strip(),
        urlCandidate=link,
        shortDescription=str(row.get("shortDescription") or "").strip()[:600],
        organizer=str(row.get("organizer") or "").strip(),
        specialtyTags=[str(tag).strip().lower() for tag in spec_tags if str(tag).strip()],
        regionTags=[str(tag).strip().upper() for tag in reg_tags if str(tag).strip()],
        rationale=str(row.get("rationale") or "").strip()[:300],
        confidence=score,
    )
|
||||
|
||||
|
||||
def extract_event_candidates(
    search_rows: list[SearchResult],
    specialty: str,
    regions: list[str],
) -> list[EventCandidate]:
    """Run the LLM extraction step over web-search rows.

    Pipeline: keep the 10 best-ranked results, fetch page excerpts for
    extra context, build one prompt, call the provider selected via
    LLM_PROVIDER ("gemini", otherwise OpenAI), then normalize the
    returned rows. Rows that fail normalization are silently dropped.
    """
    if not search_rows:
        return []
    # Cap the prompt size: only the top 10 results are considered.
    top_rows = list(search_rows[:10])
    excerpts = []
    for row in top_rows:
        text = _extract_page_excerpt(row.url)
        if text:
            excerpts.append({"url": row.url, "excerpt": text})
    prompt = _make_prompt(top_rows, specialty=specialty, regions=regions, excerpts=excerpts)
    provider = os.getenv("LLM_PROVIDER", "openai").strip().lower()
    if provider == "gemini":
        payload = _call_gemini_json(prompt)
    else:
        payload = _call_openai_json(prompt)
    rows = payload.get("events") if isinstance(payload.get("events"), list) else []
    out: list[EventCandidate] = []
    for row in rows:
        cand = _normalize_candidate(row, default_specialty=specialty, default_regions=regions)
        if cand is not None:
            out.append(cand)
    return out
|
||||
|
||||
168
AzA march 2026/services/event_llm_direct.py
Normal file
168
AzA march 2026/services/event_llm_direct.py
Normal file
@@ -0,0 +1,168 @@
|
||||
"""Kongress-Suche via OpenAI web search – so simpel wie eine ChatGPT-Anfrage."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from datetime import date
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
# English specialty slug -> German label, used when building LLM prompts.
SPECIALTY_DE = {
    "dermatology": "Dermatologie", "general-medicine": "Allgemeinmedizin",
    "internal-medicine": "Innere Medizin", "gynecology": "Gynäkologie",
    "anesthesiology": "Anästhesiologie", "cardiology": "Kardiologie",
    "oncology": "Onkologie", "pediatrics": "Pädiatrie",
    "neurology": "Neurologie", "psychiatry": "Psychiatrie",
    "surgery": "Chirurgie", "ophthalmology": "Ophthalmologie",
    "ent": "HNO", "urology": "Urologie", "orthopedics": "Orthopädie",
    "radiology": "Radiologie", "rheumatology": "Rheumatologie",
    "endocrinology": "Endokrinologie", "gastroenterology": "Gastroenterologie",
    "pulmonology": "Pneumologie", "nephrology": "Nephrologie",
    "infectiology": "Infektiologie", "emergency-medicine": "Notfallmedizin",
    "pathology": "Pathologie", "allergology": "Allergologie",
}

# Region code -> human-readable (German) region name for prompts.
REGION_DE = {"CH": "Schweiz", "EU": "Europa", "WORLD": "weltweit", "US": "USA", "CA": "Kanada"}

# Lowercased country names (German/English/French variants) -> ISO 3166-1 alpha-2.
COUNTRY_MAP = {
    "schweiz": "CH", "suisse": "CH", "switzerland": "CH",
    "deutschland": "DE", "germany": "DE", "österreich": "AT", "austria": "AT",
    "frankreich": "FR", "france": "FR", "italien": "IT", "italy": "IT",
    "spanien": "ES", "spain": "ES", "grossbritannien": "GB", "uk": "GB",
    "united kingdom": "GB", "griechenland": "GR", "greece": "GR",
    "niederlande": "NL", "netherlands": "NL", "usa": "US", "united states": "US",
    "finnland": "FI", "finland": "FI", "dänemark": "DK", "denmark": "DK",
    "schweden": "SE", "sweden": "SE", "portugal": "PT",
    "belgien": "BE", "belgium": "BE", "china": "CN", "japan": "JP",
}

# Countries whose events get the "EU" region tag. NOTE: this is geographic
# Europe (includes CH, GB, NO), not the political European Union.
EU_SET = {"DE","AT","FR","IT","ES","GB","GR","NL","BE","PT","FI","DK","SE","CZ",
          "PL","IE","NO","HU","RO","BG","HR","SK","SI","LT","LV","EE","CY","MT","LU","CH"}
|
||||
|
||||
|
||||
@dataclass
class EventCandidate:
    """Event candidate produced by the direct LLM web search.

    camelCase field names mirror the JSON keys requested from the model.
    All fields default so partially-filled rows can still be built.
    """

    name: str = ""
    startDate: str | None = None  # ISO YYYY-MM-DD or None
    endDate: str | None = None    # ISO YYYY-MM-DD; defaults to startDate upstream
    city: str = ""
    country: str = ""             # ISO 3166-1 alpha-2 code after normalization
    urlCandidate: str = ""
    shortDescription: str = ""    # truncated to 600 chars upstream
    organizer: str = ""
    specialtyTags: list[str] | None = None
    regionTags: list[str] | None = None
    confidence: float = 0.9       # fixed default assigned by the direct search
|
||||
|
||||
|
||||
def _parse_json(text: str) -> list[dict]:
|
||||
"""Parse JSON aus LLM-Antwort – robust gegen ```json ... ``` Wrapper."""
|
||||
t = text.strip()
|
||||
t = re.sub(r"^```[a-zA-Z]*\s*", "", t)
|
||||
t = re.sub(r"\s*```\s*$", "", t)
|
||||
t = t.strip()
|
||||
try:
|
||||
obj = json.loads(t)
|
||||
if isinstance(obj, dict) and "events" in obj:
|
||||
return obj["events"]
|
||||
if isinstance(obj, list):
|
||||
return obj
|
||||
except Exception:
|
||||
pass
|
||||
m = re.search(r"\{[\s\S]*\}", t)
|
||||
if m:
|
||||
try:
|
||||
obj = json.loads(m.group(0))
|
||||
if isinstance(obj, dict) and "events" in obj:
|
||||
return obj["events"]
|
||||
except Exception:
|
||||
pass
|
||||
return []
|
||||
|
||||
|
||||
def _norm_country(raw: str) -> str:
    """Normalize a country name or code to a 2-letter uppercase code.

    Already-2-letter inputs pass through uppercased; known names are
    looked up in COUNTRY_MAP; anything else falls back to the first two
    uppercased characters (best effort).
    """
    stripped = raw.strip()
    if len(stripped) == 2 and stripped.isalpha():
        return stripped.upper()
    fallback = stripped.upper()[:2] if len(stripped) >= 2 else stripped
    return COUNTRY_MAP.get(stripped.lower(), fallback)
|
||||
|
||||
|
||||
def _region_tags(cc: str) -> list[str]:
    """Map a country code to sorted region tags; defaults to ["EU"].

    CH gets both "CH" and "EU" (it is in EU_SET); US and CA share "US".
    """
    found: list[str] = []
    if cc == "CH":
        found.append("CH")
    if cc in EU_SET:
        found.append("EU")
    if cc in ("US", "CA"):
        found.append("US")
    return sorted(found) or ["EU"]
|
||||
|
||||
|
||||
def _safe_date(v) -> str | None:
|
||||
s = str(v or "").strip()
|
||||
return s if re.fullmatch(r"\d{4}-\d{2}-\d{2}", s) else None
|
||||
|
||||
|
||||
def query_events_direct(
    specialty: str,
    regions: list[str],
    from_date: date,
    to_date: date,
    lang: str = "de",
    limit: int = 40,
) -> list[EventCandidate]:
    """Ask an OpenAI web-search model directly for upcoming congresses.

    Builds a German prompt ("from today until to_date"), sends it to the
    model named by EVENT_SEARCH_MODEL, parses the JSON reply and maps
    each row into an EventCandidate with normalized country/region tags.

    Raises:
        RuntimeError: when OPENAI_API_KEY is not set.

    NOTE(review): ``from_date``, ``lang`` and ``limit`` are currently
    unused — the prompt always starts at today and asks for "at least
    15" events. Confirm whether they should be wired into the prompt.
    """
    key = os.getenv("OPENAI_API_KEY", "").strip()
    if not key:
        raise RuntimeError("OPENAI_API_KEY nicht gesetzt")

    spec = SPECIALTY_DE.get(specialty, specialty)
    reg = ", ".join(REGION_DE.get(r.upper(), r) for r in regions) or "Europa, Schweiz"
    today = date.today().isoformat()

    prompt = (
        f"Suche im Internet nach den nächsten Kongressen und Weiterbildungen "
        f"für {spec} in {reg} ab heute ({today}) "
        f"bis {to_date.isoformat()}.\n\n"
        f"Liste alle wichtigen Kongresse auf die du findest (mindestens 15). "
        f"Gib die Antwort als JSON zurück:\n"
        '{"events": [{"name": "...", "startDate": "YYYY-MM-DD", '
        '"endDate": "YYYY-MM-DD", "city": "...", "country": "CH", '
        '"url": "...", "description": "...", "organizer": "..."}]}'
    )

    # A "-search-preview" model performs live web search server-side.
    model = os.getenv("EVENT_SEARCH_MODEL", "gpt-4o-mini-search-preview").strip()
    client = OpenAI(api_key=key, timeout=80)
    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Suche im Internet nach echten Kongressen. Antwort NUR als JSON."},
            {"role": "user", "content": prompt},
        ],
    )
    txt = (resp.choices[0].message.content or "").strip()
    rows = _parse_json(txt)

    out: list[EventCandidate] = []
    for r in rows:
        if not isinstance(r, dict):
            continue
        name = str(r.get("name") or "").strip()
        if not name:
            continue
        cc = _norm_country(str(r.get("country") or ""))
        out.append(EventCandidate(
            name=name,
            startDate=_safe_date(r.get("startDate")),
            # One-day events often report only a start date.
            endDate=_safe_date(r.get("endDate")) or _safe_date(r.get("startDate")),
            city=str(r.get("city") or "").strip(),
            country=cc,
            # The prompt asks for "url"; accept "urlCandidate" defensively.
            urlCandidate=str(r.get("url") or r.get("urlCandidate") or "").strip(),
            shortDescription=str(r.get("description") or r.get("shortDescription") or "").strip()[:600],
            organizer=str(r.get("organizer") or "").strip(),
            specialtyTags=[specialty],
            regionTags=_region_tags(cc),
            confidence=0.9,
        ))
    return out
|
||||
167
AzA march 2026/services/link_verify.py
Normal file
167
AzA march 2026/services/link_verify.py
Normal file
@@ -0,0 +1,167 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import threading
|
||||
from dataclasses import asdict, dataclass
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
# Verified URLs are cached for one week to avoid re-hitting event sites.
URL_VERIFY_TTL_SECONDS = 7 * 24 * 60 * 60
# Guards _verify_cache against concurrent callers.
_verify_lock = threading.Lock()
# url -> (verification result, expiry as a UTC POSIX timestamp).
_verify_cache: dict[str, tuple["LinkVerification", float]] = {}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class LinkVerification:
    """Outcome of checking an event URL.

    ``httpStatus == 0`` means the request could not be performed at all.
    """

    httpStatus: int
    finalUrl: str            # URL after following redirects
    redirectCount: int
    isDirectEventPage: bool  # heuristic: page is about this specific event
    checkedAt: str           # ISO-8601 UTC timestamp of the check
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _url_slug_tokens(url: str) -> set[str]:
|
||||
path = urlparse(url).path.lower()
|
||||
parts = [p for p in re.split(r"[^a-z0-9]+", path) if p]
|
||||
return set(parts)
|
||||
|
||||
|
||||
def _name_tokens(name: str) -> set[str]:
|
||||
return {p for p in re.split(r"[^a-z0-9]+", (name or "").lower()) if len(p) >= 4}
|
||||
|
||||
|
||||
def _extract_title(html: str) -> str:
|
||||
m = re.search(r"<title[^>]*>(.*?)</title>", html or "", flags=re.IGNORECASE | re.DOTALL)
|
||||
if not m:
|
||||
return ""
|
||||
t = re.sub(r"\s+", " ", m.group(1)).strip()
|
||||
return t[:300]
|
||||
|
||||
|
||||
def _looks_generic_url(final_url: str) -> bool:
|
||||
path = (urlparse(final_url).path or "/").strip().lower().rstrip("/")
|
||||
return path in {"", "/", "/events", "/calendar", "/home", "/congress", "/meetings"}
|
||||
|
||||
|
||||
def _is_direct_event_page(final_url: str, event_name: str, title_text: str = "") -> bool:
    """Score heuristics for "this URL is the event's own page"; True at >= 2.

    Signals: a /20xx year segment in the path (+2), overlap between URL
    slug tokens and event-name tokens (+2), name tokens in the page
    title (+1), a 20xx year in the title (+1); generic listing paths
    subtract 2.
    """
    points = 0
    slug_tokens = _url_slug_tokens(final_url)
    event_tokens = _name_tokens(event_name)
    lowered_title = (title_text or "").lower()
    if re.search(r"/20\d{2}(?:[/-]|$)", final_url.lower()):
        points += 2
    if slug_tokens & event_tokens:
        points += 2
    if any(token in lowered_title for token in list(event_tokens)[:5]):
        points += 1
    if re.search(r"\b20\d{2}\b", lowered_title):
        points += 1
    if _looks_generic_url(final_url):
        points -= 2
    return points >= 2
|
||||
|
||||
|
||||
def _extract_candidate_links(base_url: str, html: str, event_name: str) -> list[str]:
    """Collect up to 8 same-host links that look event-specific.

    A link qualifies when its path contains a /20xx year segment or any
    token from the event name. Duplicates and off-host links are skipped.
    """
    year_pattern = re.compile(r"/20\d{2}(?:[/-]|$)")
    anchors = re.findall(r'href=["\']([^"\']+)["\']', html or "", flags=re.IGNORECASE)
    host = (urlparse(base_url).netloc or "").lower()
    event_tokens = _name_tokens(event_name)
    links: list[str] = []
    for anchor in anchors:
        absolute = urljoin(base_url, anchor)
        parsed = urlparse(absolute)
        if parsed.scheme not in {"http", "https"}:
            continue
        if (parsed.netloc or "").lower() != host:
            continue
        lowered_path = parsed.path.lower()
        wanted = bool(year_pattern.search(lowered_path)) or any(tok in lowered_path for tok in event_tokens)
        if wanted and absolute not in links:
            links.append(absolute)
            if len(links) >= 8:
                break
    return links
|
||||
|
||||
|
||||
def _request_with_fallback(url: str, timeout_s: int = 5) -> requests.Response | None:
    """HEAD the URL (following redirects) and fall back to GET on failure.

    Many event sites reject or mis-handle HEAD (e.g. 405 Method Not
    Allowed), so any error status from HEAD triggers a GET retry.
    Returns None when both attempts raise (DNS failure, timeout, ...).
    """
    headers = {"User-Agent": "AZA-LiveEventSearch/1.0"}
    try:
        r = requests.head(url, allow_redirects=True, timeout=timeout_s, headers=headers)
        # Any error status (which already includes 405) -> retry with GET,
        # since some servers only implement GET properly. The original
        # "or r.status_code == 405" clause was redundant (405 >= 400).
        if r.status_code >= 400:
            r = requests.get(url, allow_redirects=True, timeout=timeout_s + 2, headers=headers)
        return r
    except Exception:
        try:
            return requests.get(url, allow_redirects=True, timeout=timeout_s + 2, headers=headers)
        except Exception:
            return None
|
||||
|
||||
|
||||
def verify_event_url(url: str, event_name: str) -> LinkVerification:
    """Verify that *url* works and (heuristically) points at the event page.

    Results are cached per URL for URL_VERIFY_TTL_SECONDS. When the
    fetched page is reachable but looks generic, up to 8 same-host links
    are probed for a page that matches the event ("second pass").
    Failed fetches are cached too, so a dead URL is not retried per call.
    """
    u = (url or "").strip()
    if not u:
        # Empty URL: nothing to check, and nothing worth caching.
        return LinkVerification(httpStatus=0, finalUrl="", redirectCount=0, isDirectEventPage=False, checkedAt=_now_iso())
    now_ts = datetime.now(timezone.utc).timestamp()
    with _verify_lock:
        cached = _verify_cache.get(u)
        if cached and cached[1] > now_ts:  # cached[1] is the expiry timestamp
            return cached[0]

    # Network I/O happens outside the lock so slow sites don't serialize callers.
    resp = _request_with_fallback(u, timeout_s=5)
    if resp is None:
        out = LinkVerification(httpStatus=0, finalUrl=u, redirectCount=0, isDirectEventPage=False, checkedAt=_now_iso())
        with _verify_lock:
            _verify_cache[u] = (out, now_ts + URL_VERIFY_TTL_SECONDS)
        return out

    final_url = str(resp.url or u)
    status = int(resp.status_code or 0)
    redirects = len(resp.history or [])
    # Only the first 12 kB are inspected — enough for <title> and nav links.
    text = resp.text[:12000] if isinstance(resp.text, str) else ""
    title_text = _extract_title(text)
    is_direct = (200 <= status < 300) and _is_direct_event_page(final_url, event_name, title_text=title_text)

    # Second pass: the landing page was generic — probe same-host links
    # that look event-specific for a better match.
    if (200 <= status < 300) and not is_direct and text:
        for cand in _extract_candidate_links(final_url, text, event_name=event_name):
            second = _request_with_fallback(cand, timeout_s=5)
            if second is None:
                continue
            second_status = int(second.status_code or 0)
            second_final = str(second.url or cand)
            second_redirects = len(second.history or [])
            second_title = _extract_title((second.text or "")[:12000] if isinstance(second.text, str) else "")
            second_direct = (200 <= second_status < 300) and _is_direct_event_page(second_final, event_name, title_text=second_title)
            if second_direct:
                out = LinkVerification(
                    httpStatus=second_status,
                    finalUrl=second_final,
                    redirectCount=second_redirects,
                    isDirectEventPage=True,
                    checkedAt=_now_iso(),
                )
                # Cache under the ORIGINAL url so future lookups hit directly.
                with _verify_lock:
                    _verify_cache[u] = (out, now_ts + URL_VERIFY_TTL_SECONDS)
                return out

    out = LinkVerification(
        httpStatus=status,
        finalUrl=final_url,
        redirectCount=redirects,
        isDirectEventPage=is_direct,
        checkedAt=_now_iso(),
    )
    with _verify_lock:
        _verify_cache[u] = (out, now_ts + URL_VERIFY_TTL_SECONDS)
    return out
|
||||
|
||||
|
||||
def verification_to_dict(v: LinkVerification) -> dict[str, Any]:
    """Serialize a LinkVerification to a plain dict (e.g. for JSON responses)."""
    return asdict(v)
|
||||
|
||||
208
AzA march 2026/services/live_event_search.py
Normal file
208
AzA march 2026/services/live_event_search.py
Normal file
@@ -0,0 +1,208 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from dataclasses import dataclass
|
||||
from datetime import date
|
||||
from typing import Iterable
|
||||
|
||||
|
||||
class SearchProviderConfigError(RuntimeError):
    """Raised when the selected web-search provider lacks required env vars."""
    pass
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SearchResult:
    """One normalized row from a web-search provider."""

    title: str    # result title (may be "")
    snippet: str  # short result snippet (may be "")
    url: str      # result URL; URL-less rows are dropped during normalization
|
||||
|
||||
|
||||
def _http_get_json(url: str, timeout: int = 10) -> dict:
    """GET *url* and decode the body as a JSON object.

    Raises:
        RuntimeError: when the body is valid JSON but not an object.

    Network errors and JSON decode errors propagate to the caller.
    """
    req = urllib.request.Request(
        url,
        headers={"User-Agent": "AZA-LiveEventSearch/1.0"},
        method="GET",
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        payload = resp.read().decode("utf-8", errors="ignore")
    data = json.loads(payload)
    if not isinstance(data, dict):
        raise RuntimeError("Ungültige JSON-Antwort der Websuche")
    return data
|
||||
|
||||
|
||||
def _normalize_results(rows: Iterable[dict]) -> list[SearchResult]:
    """Convert raw provider rows into SearchResult items.

    Non-dict rows and rows without a URL are skipped; title/snippet
    default to "".
    """
    normalized: list[SearchResult] = []
    for raw in rows:
        if not isinstance(raw, dict):
            continue
        link = str(raw.get("url") or "").strip()
        if not link:
            continue
        normalized.append(
            SearchResult(
                title=str(raw.get("title") or "").strip(),
                snippet=str(raw.get("snippet") or "").strip(),
                url=link,
            )
        )
    return normalized
|
||||
|
||||
|
||||
def _search_google_cse(query: str, num_results: int) -> list[SearchResult]:
    """Query Google Custom Search (CSE) and normalize the result rows.

    Raises:
        SearchProviderConfigError: when the required env vars are missing.
    """
    key = os.getenv("GOOGLE_CSE_API_KEY", "").strip()
    cx = os.getenv("GOOGLE_CSE_CX", "").strip()
    if not key or not cx:
        raise SearchProviderConfigError("Google CSE nicht konfiguriert (GOOGLE_CSE_API_KEY/GOOGLE_CSE_CX fehlen).")
    params = urllib.parse.urlencode(
        {
            "key": key,
            "cx": cx,
            "q": query,
            # CSE allows at most 10 results per request.
            "num": max(1, min(int(num_results), 10)),
            "safe": "off",
        }
    )
    url = f"https://www.googleapis.com/customsearch/v1?{params}"
    data = _http_get_json(url, timeout=10)
    items = data.get("items") if isinstance(data.get("items"), list) else []
    return _normalize_results(
        {
            "title": it.get("title", ""),
            "snippet": it.get("snippet", ""),
            "url": it.get("link", ""),
        }
        for it in items
        if isinstance(it, dict)
    )
|
||||
|
||||
|
||||
def _search_serpapi(query: str, num_results: int) -> list[SearchResult]:
    """Query SerpAPI (Google engine) and normalize its organic results.

    Raises:
        SearchProviderConfigError: when SERPAPI_API_KEY is missing.
    """
    key = os.getenv("SERPAPI_API_KEY", "").strip()
    if not key:
        raise SearchProviderConfigError("SerpAPI nicht konfiguriert (SERPAPI_API_KEY fehlt).")
    params = urllib.parse.urlencode(
        {
            "api_key": key,
            "engine": "google",
            "q": query,
            "num": max(1, min(int(num_results), 10)),
        }
    )
    url = f"https://serpapi.com/search.json?{params}"
    data = _http_get_json(url, timeout=12)
    rows = data.get("organic_results") if isinstance(data.get("organic_results"), list) else []
    return _normalize_results(
        {
            "title": it.get("title", ""),
            "snippet": it.get("snippet", ""),
            "url": it.get("link", ""),
        }
        for it in rows
        if isinstance(it, dict)
    )
|
||||
|
||||
|
||||
def _search_bing(query: str, num_results: int) -> list[SearchResult]:
    """Query Bing Web Search v7 and normalize the webPages.value rows.

    Raises:
        SearchProviderConfigError: when BING_API_KEY is missing.
    """
    key = os.getenv("BING_API_KEY", "").strip()
    if not key:
        raise SearchProviderConfigError("Bing Web Search nicht konfiguriert (BING_API_KEY fehlt).")
    params = urllib.parse.urlencode(
        {
            "q": query,
            "count": max(1, min(int(num_results), 10)),
            "textDecorations": "false",
            "textFormat": "Raw",
        }
    )
    url = f"https://api.bing.microsoft.com/v7.0/search?{params}"
    req = urllib.request.Request(
        url,
        headers={
            "User-Agent": "AZA-LiveEventSearch/1.0",
            # Bing authenticates via this subscription header, not a query param.
            "Ocp-Apim-Subscription-Key": key,
        },
        method="GET",
    )
    with urllib.request.urlopen(req, timeout=10) as resp:
        payload = resp.read().decode("utf-8", errors="ignore")
    data = json.loads(payload)
    # Bing nests results under webPages.value; tolerate missing keys.
    rows = (((data or {}).get("webPages") or {}).get("value")) if isinstance(data, dict) else []
    if not isinstance(rows, list):
        rows = []
    return _normalize_results(
        {
            "title": it.get("name", ""),
            "snippet": it.get("snippet", ""),
            "url": it.get("url", ""),
        }
        for it in rows
        if isinstance(it, dict)
    )
|
||||
|
||||
|
||||
def search_web(query: str, num_results: int = 8) -> list[SearchResult]:
    """Dispatch *query* to the provider selected via WEBSEARCH_PROVIDER.

    Raises:
        SearchProviderConfigError: when the env var is unset or unknown.
    """
    dispatch = {
        "google_cse": _search_google_cse,
        "serpapi": _search_serpapi,
        "bing": _search_bing,
    }
    provider = os.getenv("WEBSEARCH_PROVIDER", "").strip().lower()
    handler = dispatch.get(provider)
    if handler is None:
        raise SearchProviderConfigError(
            "Web Search nicht konfiguriert. Setze WEBSEARCH_PROVIDER auf google_cse, serpapi oder bing."
        )
    return handler(query, num_results=num_results)
|
||||
|
||||
|
||||
def build_queries(
    specialty: str,
    regions: list[str],
    from_date: date,
    to_date: date,
    lang: str = "de",
    max_queries: int = 10,
) -> list[str]:
    """Build deduplicated web-search queries for medical events.

    Combines the specialty with a year hint (covering from_date..to_date,
    capped at three years), region keywords and German/English event
    terms. Returns at most ``clamp(max_queries, 6, 12)`` unique queries.

    NOTE(review): ``lang`` is accepted for interface compatibility but
    currently does not influence the generated queries.
    """
    spec = (specialty or "medical").strip()
    # Year hint: at most the first three calendar years of the window.
    years = [str(y) for y in range(from_date.year, to_date.year + 1)]
    year_hint = " ".join(years[:3]) if years else str(from_date.year)

    region_tokens: list[str] = []
    region_norm = {r.strip().upper() for r in regions if str(r).strip()}
    if "CH" in region_norm:
        region_tokens.extend(["Schweiz", "Suisse", "Switzerland"])
    if "EU" in region_norm:
        region_tokens.extend(["Europa", "Europe"])
    if "WORLD" in region_norm:
        region_tokens.extend(["global", "international"])
    if "US" in region_norm:
        region_tokens.extend(["USA", "United States"])
    if "CA" in region_norm:
        region_tokens.extend(["Canada"])
    if not region_tokens:
        region_tokens.extend(["Europe", "Switzerland"])

    # dict.fromkeys keeps first-seen order while removing duplicates.
    loc_hint = " ".join(dict.fromkeys(region_tokens))

    base = [
        f"{spec} congress {year_hint} {loc_hint} dates registration",
        f"{spec} conference {year_hint} {loc_hint} official event page",
        f"{spec} course {year_hint} {loc_hint} CME",
        f"{spec} Weiterbildung {year_hint} {loc_hint}",
        f"{spec} Fortbildung {year_hint} {loc_hint}",
        f"{spec} Kongress {year_hint} {loc_hint}",
        # NOTE(review): "site:org" relies on the provider accepting a bare
        # TLD filter — confirm against the configured search engine.
        f"{spec} congress {year_hint} site:org",
        f"{spec} symposium {year_hint} {loc_hint}",
        f"{spec} annual meeting {year_hint} {loc_hint}",
        # Fixed: the original query carried a stray trailing "python" token
        # (likely a leftover paste) that polluted medical-event searches.
        f"{spec} event {year_hint} {loc_hint}",
    ]
    out: list[str] = []
    for q in base:
        q_clean = " ".join(str(q).split())
        if q_clean and q_clean not in out:
            out.append(q_clean)
        if len(out) >= max(6, min(int(max_queries), 12)):
            break
    return out
|
||||
|
||||
171
AzA march 2026/services/news_llm_search.py
Normal file
171
AzA march 2026/services/news_llm_search.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""Live medical news search using OpenAI web search (like ChatGPT)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from dataclasses import dataclass, asdict
|
||||
from datetime import date
|
||||
from typing import Any
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
# English specialty slug -> German label used in the news-search prompt.
SPECIALTY_LABELS = {
    "dermatology": "Dermatologie",
    "general-medicine": "Allgemeinmedizin",
    "internal-medicine": "Innere Medizin",
    "gynecology": "Gynäkologie",
    "anesthesiology": "Anästhesiologie",
    "cardiology": "Kardiologie",
    "oncology": "Onkologie",
    "pediatrics": "Pädiatrie",
    "neurology": "Neurologie",
    "psychiatry": "Psychiatrie",
    "surgery": "Chirurgie",
    "ophthalmology": "Ophthalmologie",
    "ent": "HNO",
    "urology": "Urologie",
    "orthopedics": "Orthopädie",
    "radiology": "Radiologie",
    "rheumatology": "Rheumatologie",
    "endocrinology": "Endokrinologie",
    "gastroenterology": "Gastroenterologie",
    "pulmonology": "Pneumologie",
    "nephrology": "Nephrologie",
    "infectiology": "Infektiologie",
    "emergency-medicine": "Notfallmedizin",
    "pathology": "Pathologie",
    "allergology": "Allergologie",
}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class NewsCandidate:
    """One normalized medical-news item returned by the live search."""

    title: str
    url: str          # direct link to the article (may be "")
    source: str       # e.g. "NEJM", "Lancet"
    publishedAt: str  # YYYY-MM-DD as reported by the LLM (not validated)
    summary: str      # short summary, truncated to 800 chars upstream
    tags: list[str]   # lowercased specialty slugs
    language: str     # article language code, e.g. "en" / "de"
|
||||
|
||||
|
||||
def _extract_json_block(text: str) -> dict:
|
||||
cleaned = text.strip()
|
||||
cleaned = re.sub(r"^```[a-zA-Z]*\s*", "", cleaned)
|
||||
cleaned = re.sub(r"\s*```\s*$", "", cleaned)
|
||||
cleaned = cleaned.strip()
|
||||
try:
|
||||
data = json.loads(cleaned)
|
||||
if isinstance(data, dict):
|
||||
return data
|
||||
except Exception:
|
||||
pass
|
||||
match = re.search(r"\{[\s\S]*\}", cleaned)
|
||||
if match:
|
||||
try:
|
||||
data = json.loads(match.group(0))
|
||||
if isinstance(data, dict):
|
||||
return data
|
||||
except Exception:
|
||||
pass
|
||||
return {"news": []}
|
||||
|
||||
|
||||
def _build_news_prompt(
    specialties: list[str],
    limit: int,
) -> str:
    """Build the German news-search prompt for the given specialty slugs.

    Slugs are mapped to German labels via SPECIALTY_LABELS (unknown slugs
    pass through as-is); an empty list falls back to "Medizin allgemein".
    The prompt embeds today's date and demands a pure-JSON reply.
    """
    spec_labels = [SPECIALTY_LABELS.get(s, s) for s in specialties]
    spec_text = ", ".join(spec_labels) if spec_labels else "Medizin allgemein"
    today_str = date.today().isoformat()

    return (
        f"Suche im Internet nach den NEUESTEN und WICHTIGSTEN medizinischen News "
        f"und Forschungsergebnissen. Heutiges Datum: {today_str}\n\n"
        f"Fachgebiete: {spec_text}\n\n"
        "Ich brauche aktuelle, relevante Nachrichten aus der Medizin:\n"
        "- Neue Studien und Forschungsergebnisse\n"
        "- Neue Therapien und Medikamente (Zulassungen, Phase-III-Ergebnisse)\n"
        "- Leitlinien-Updates\n"
        "- Wichtige Konferenz-Highlights und Abstracts\n"
        "- Gesundheitspolitische Nachrichten\n"
        "- Sicherheitswarnungen (FDA, EMA, Swissmedic)\n\n"
        "Bevorzuge Quellen wie: NEJM, Lancet, JAMA, BMJ, Nature Medicine, "
        "Deutsches Ärzteblatt, Swiss Medical Weekly, Medical Tribune, "
        "PubMed, Medscape, aerzteblatt.de\n\n"
        f"Liefere mindestens 10, maximal {limit} Ergebnisse.\n\n"
        "WICHTIG: Antwort als REINES JSON, kein anderer Text.\n"
        '{"news": [...]}\n'
        "Felder pro News-Item:\n"
        "title (Titel der Nachricht),\n"
        "url (DIREKTER Link zum Artikel),\n"
        "source (Name der Quelle, z.B. 'NEJM', 'Lancet'),\n"
        "publishedAt (YYYY-MM-DD, Veröffentlichungsdatum),\n"
        "summary (2-4 Sätze Zusammenfassung),\n"
        'tags (Array der Fachgebiete, z.B. ["dermatology", "oncology"]),\n'
        "language (Sprache des Originalartikels, z.B. 'en', 'de')"
    )
|
||||
|
||||
|
||||
def _normalize_news(row: dict, default_tags: list[str]) -> NewsCandidate | None:
    """Coerce one raw LLM row into a NewsCandidate.

    Returns None for non-dict rows and rows without a title. A missing
    URL is tolerated (kept as ""); tags fall back to *default_tags*.
    """
    if not isinstance(row, dict):
        return None
    headline = str(row.get("title") or "").strip()
    if not headline:
        return None
    link = str(row.get("url") or "").strip()
    raw_tags = row.get("tags")
    tag_list = raw_tags if isinstance(raw_tags, list) else list(default_tags)
    return NewsCandidate(
        title=headline,
        url=link,
        source=str(row.get("source") or "").strip(),
        publishedAt=str(row.get("publishedAt") or "").strip(),
        summary=str(row.get("summary") or row.get("description") or "").strip()[:800],
        tags=[str(t).strip().lower() for t in tag_list if str(t).strip()],
        language=str(row.get("language") or "en").strip().lower(),
    )
|
||||
|
||||
|
||||
def search_medical_news(
    specialties: list[str],
    limit: int = 30,
) -> list[NewsCandidate]:
    """Live-search medical news via an OpenAI web-search model.

    Raises:
        RuntimeError: when OPENAI_API_KEY is not set.

    Rows that cannot be normalized (e.g. missing title) are dropped.
    """
    key = os.getenv("OPENAI_API_KEY", "").strip()
    if not key:
        raise RuntimeError("OPENAI_API_KEY nicht gesetzt")

    prompt = _build_news_prompt(specialties=specialties, limit=limit)
    # A "-search-preview" model performs live web search server-side.
    search_model = os.getenv("NEWS_SEARCH_MODEL", "gpt-4o-mini-search-preview").strip()

    client = OpenAI(api_key=key, timeout=80)
    resp = client.chat.completions.create(
        model=search_model,
        messages=[
            {
                "role": "system",
                "content": (
                    "Du bist ein medizinischer Nachrichtenassistent. "
                    "Suche im Internet nach den neuesten medizinischen Nachrichten "
                    "und liefere die Ergebnisse als JSON. "
                    "Gib NUR real existierende Artikel mit funktionierenden Links an."
                ),
            },
            {"role": "user", "content": prompt},
        ],
    )
    txt = ""
    try:
        txt = (resp.choices[0].message.content or "").strip()
    except Exception:
        # Defensive: malformed completion objects yield an empty reply.
        txt = ""

    payload = _extract_json_block(txt)
    rows = payload.get("news") if isinstance(payload.get("news"), list) else []

    out: list[NewsCandidate] = []
    for row in rows:
        cand = _normalize_news(row, default_tags=specialties)
        if cand is not None:
            out.append(cand)
    return out
|
||||
Reference in New Issue
Block a user