Files
aza/AzA march 2026 - Kopie (3)/services/live_event_search.py
2026-03-30 07:59:11 +02:00

209 lines
6.7 KiB
Python

from __future__ import annotations
import json
import os
import urllib.parse
import urllib.request
from dataclasses import dataclass
from datetime import date
from typing import Iterable
class SearchProviderConfigError(RuntimeError):
    """Raised when the selected web-search provider lacks required configuration."""
@dataclass(frozen=True)
class SearchResult:
    """One normalized web-search hit.

    Immutable value object; `title` and `snippet` may be empty strings,
    while producers are expected to drop rows without a `url`.
    """

    title: str
    snippet: str
    url: str
def _http_get_json(url: str, timeout: int = 10) -> dict:
req = urllib.request.Request(
url,
headers={"User-Agent": "AZA-LiveEventSearch/1.0"},
method="GET",
)
with urllib.request.urlopen(req, timeout=timeout) as resp:
payload = resp.read().decode("utf-8", errors="ignore")
data = json.loads(payload)
if not isinstance(data, dict):
raise RuntimeError("Ungültige JSON-Antwort der Websuche")
return data
def _normalize_results(rows: Iterable[dict]) -> list[SearchResult]:
    """Convert raw provider rows into SearchResult objects.

    Non-dict rows and rows without a URL are dropped; missing or falsy
    title/snippet values become empty strings, and all fields are
    whitespace-stripped.
    """
    results: list[SearchResult] = []
    for raw in rows:
        if not isinstance(raw, dict):
            continue
        cleaned = {
            field: str(raw.get(field) or "").strip()
            for field in ("title", "snippet", "url")
        }
        # A result is only useful if it can be followed somewhere.
        if cleaned["url"]:
            results.append(SearchResult(**cleaned))
    return results
def _search_google_cse(query: str, num_results: int) -> list[SearchResult]:
    """Run *query* against Google Custom Search and return normalized hits.

    Reads GOOGLE_CSE_API_KEY and GOOGLE_CSE_CX from the environment and
    raises SearchProviderConfigError if either is missing.
    """
    api_key = os.getenv("GOOGLE_CSE_API_KEY", "").strip()
    engine_id = os.getenv("GOOGLE_CSE_CX", "").strip()
    if not (api_key and engine_id):
        raise SearchProviderConfigError("Google CSE nicht konfiguriert (GOOGLE_CSE_API_KEY/GOOGLE_CSE_CX fehlen).")
    query_string = urllib.parse.urlencode(
        {
            "key": api_key,
            "cx": engine_id,
            "q": query,
            # Google CSE allows at most 10 results per request.
            "num": max(1, min(int(num_results), 10)),
            "safe": "off",
        }
    )
    data = _http_get_json(
        f"https://www.googleapis.com/customsearch/v1?{query_string}", timeout=10
    )
    items = data.get("items")
    if not isinstance(items, list):
        items = []
    rows = (
        {
            "title": item.get("title", ""),
            "snippet": item.get("snippet", ""),
            "url": item.get("link", ""),
        }
        for item in items
        if isinstance(item, dict)
    )
    return _normalize_results(rows)
def _search_serpapi(query: str, num_results: int) -> list[SearchResult]:
    """Run *query* against SerpAPI (Google engine) and return normalized hits.

    Reads SERPAPI_API_KEY from the environment and raises
    SearchProviderConfigError when it is missing.
    """
    api_key = os.getenv("SERPAPI_API_KEY", "").strip()
    if not api_key:
        raise SearchProviderConfigError("SerpAPI nicht konfiguriert (SERPAPI_API_KEY fehlt).")
    query_string = urllib.parse.urlencode(
        {
            "api_key": api_key,
            "engine": "google",
            "q": query,
            # Clamp to the 1..10 range the upstream API expects.
            "num": max(1, min(int(num_results), 10)),
        }
    )
    data = _http_get_json(f"https://serpapi.com/search.json?{query_string}", timeout=12)
    organic = data.get("organic_results")
    if not isinstance(organic, list):
        organic = []
    rows = (
        {
            "title": entry.get("title", ""),
            "snippet": entry.get("snippet", ""),
            "url": entry.get("link", ""),
        }
        for entry in organic
        if isinstance(entry, dict)
    )
    return _normalize_results(rows)
def _search_bing(query: str, num_results: int) -> list[SearchResult]:
    """Run *query* against Bing Web Search v7 and return normalized hits.

    Reads BING_API_KEY from the environment and raises
    SearchProviderConfigError when it is missing. Issues the request
    directly (not via _http_get_json) because Bing needs the
    Ocp-Apim-Subscription-Key header.
    """
    api_key = os.getenv("BING_API_KEY", "").strip()
    if not api_key:
        raise SearchProviderConfigError("Bing Web Search nicht konfiguriert (BING_API_KEY fehlt).")
    query_string = urllib.parse.urlencode(
        {
            "q": query,
            "count": max(1, min(int(num_results), 10)),
            "textDecorations": "false",
            "textFormat": "Raw",
        }
    )
    request = urllib.request.Request(
        f"https://api.bing.microsoft.com/v7.0/search?{query_string}",
        headers={
            "User-Agent": "AZA-LiveEventSearch/1.0",
            "Ocp-Apim-Subscription-Key": api_key,
        },
        method="GET",
    )
    with urllib.request.urlopen(request, timeout=10) as response:
        data = json.loads(response.read().decode("utf-8", errors="ignore"))
    # Results live under webPages.value; tolerate any missing/odd shape.
    value = None
    if isinstance(data, dict):
        value = ((data or {}).get("webPages") or {}).get("value")
    if not isinstance(value, list):
        value = []
    rows = (
        {
            "title": page.get("name", ""),
            "snippet": page.get("snippet", ""),
            "url": page.get("url", ""),
        }
        for page in value
        if isinstance(page, dict)
    )
    return _normalize_results(rows)
def search_web(query: str, num_results: int = 8) -> list[SearchResult]:
    """Dispatch *query* to the provider named in WEBSEARCH_PROVIDER.

    Supported provider names (case-insensitive): google_cse, serpapi, bing.
    Raises SearchProviderConfigError for an unset or unknown provider.
    """
    dispatch = {
        "google_cse": _search_google_cse,
        "serpapi": _search_serpapi,
        "bing": _search_bing,
    }
    provider_name = os.getenv("WEBSEARCH_PROVIDER", "").strip().lower()
    provider = dispatch.get(provider_name)
    if provider is None:
        raise SearchProviderConfigError(
            "Web Search nicht konfiguriert. Setze WEBSEARCH_PROVIDER auf google_cse, serpapi oder bing."
        )
    return provider(query, num_results=num_results)
def build_queries(
    specialty: str,
    regions: list[str],
    from_date: date,
    to_date: date,
    lang: str = "de",
    max_queries: int = 10,
) -> list[str]:
    """Build deduplicated web-search query strings for medical events.

    Args:
        specialty: Medical specialty to search for; falls back to "medical"
            when empty.
        regions: Region codes ("CH", "EU", "WORLD", "US", "CA"),
            case-insensitive; unknown codes are ignored and the default
            Europe/Switzerland hint is used when nothing matches.
        from_date: Start of the date window.
        to_date: End of the date window; at most the first three years of
            the window appear in the queries as a year hint.
        lang: Accepted for interface compatibility; currently not used in
            the query templates.
        max_queries: Soft cap on the number of queries, clamped to 6..12.

    Returns:
        Ordered list of unique, whitespace-normalized query strings.
    """
    spec = (specialty or "medical").strip()
    years = [str(y) for y in range(from_date.year, to_date.year + 1)]
    year_hint = " ".join(years[:3]) if years else str(from_date.year)

    # BUG FIX: coerce entries with str() before strip()/upper() — the guard
    # already used str(r), but the normalization used bare r.strip(), which
    # crashed on non-string region values.
    region_norm = {str(r).strip().upper() for r in regions if str(r).strip()}
    region_tokens: list[str] = []
    if "CH" in region_norm:
        region_tokens.extend(["Schweiz", "Suisse", "Switzerland"])
    if "EU" in region_norm:
        region_tokens.extend(["Europa", "Europe"])
    if "WORLD" in region_norm:
        region_tokens.extend(["global", "international"])
    if "US" in region_norm:
        region_tokens.extend(["USA", "United States"])
    if "CA" in region_norm:
        region_tokens.extend(["Canada"])
    if not region_tokens:
        region_tokens.extend(["Europe", "Switzerland"])
    # dict.fromkeys dedupes while preserving first-seen order.
    loc_hint = " ".join(dict.fromkeys(region_tokens))

    base = [
        f"{spec} congress {year_hint} {loc_hint} dates registration",
        f"{spec} conference {year_hint} {loc_hint} official event page",
        f"{spec} course {year_hint} {loc_hint} CME",
        f"{spec} Weiterbildung {year_hint} {loc_hint}",
        f"{spec} Fortbildung {year_hint} {loc_hint}",
        f"{spec} Kongress {year_hint} {loc_hint}",
        f"{spec} congress {year_hint} site:org",
        f"{spec} symposium {year_hint} {loc_hint}",
        f"{spec} annual meeting {year_hint} {loc_hint}",
        # BUG FIX: this template carried a stray trailing " python" token
        # (copy-paste debris) that polluted the event search.
        f"{spec} event {year_hint} {loc_hint}",
    ]

    limit = max(6, min(int(max_queries), 12))
    out: list[str] = []
    for q in base:
        # Collapse any repeated whitespace from empty hint slots.
        q_clean = " ".join(q.split())
        if q_clean and q_clean not in out:
            out.append(q_clean)
        if len(out) >= limit:
            break
    return out