from __future__ import annotations

import json
import os
import urllib.parse
import urllib.request
from dataclasses import dataclass
from datetime import date
from typing import Iterable

# User-Agent header sent with every outgoing search request.
_USER_AGENT = "AZA-LiveEventSearch/1.0"

# Upper bound applied to the per-request result count for every provider.
_MAX_RESULTS_PER_REQUEST = 10

# Region code -> location keywords, in the order they are appended to queries.
_REGION_TOKENS: tuple[tuple[str, tuple[str, ...]], ...] = (
    ("CH", ("Schweiz", "Suisse", "Switzerland")),
    ("EU", ("Europa", "Europe")),
    ("WORLD", ("global", "international")),
    ("US", ("USA", "United States")),
    ("CA", ("Canada",)),
)

# Location keywords used when no known region code was supplied.
_DEFAULT_REGION_TOKENS: tuple[str, ...] = ("Europe", "Switzerland")


class SearchProviderConfigError(RuntimeError):
    """Raised when no provider is selected or its credentials are missing."""


@dataclass(frozen=True)
class SearchResult:
    """A single normalised web search hit."""

    title: str
    snippet: str
    url: str


def _fetch_json(
    url: str,
    timeout: int,
    extra_headers: dict[str, str] | None = None,
) -> object:
    """Perform a GET request and return the parsed JSON body, whatever its type.

    Args:
        url: Fully built request URL.
        timeout: Timeout in seconds passed to ``urlopen``.
        extra_headers: Headers sent in addition to the default User-Agent.

    Returns:
        The decoded JSON value (not necessarily a dict).
    """
    headers = {"User-Agent": _USER_AGENT}
    if extra_headers:
        headers.update(extra_headers)
    req = urllib.request.Request(url, headers=headers, method="GET")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        # Undecodable bytes are dropped rather than failing the whole request.
        payload = resp.read().decode("utf-8", errors="ignore")
    return json.loads(payload)


def _http_get_json(
    url: str,
    timeout: int = 10,
    headers: dict[str, str] | None = None,
) -> dict:
    """GET *url* and return the JSON response, which must be an object.

    Args:
        url: Fully built request URL.
        timeout: Timeout in seconds.
        headers: Optional headers sent in addition to the default User-Agent.

    Returns:
        The parsed top-level JSON object.

    Raises:
        RuntimeError: If the top-level JSON value is not an object.
    """
    data = _fetch_json(url, timeout, headers)
    if not isinstance(data, dict):
        raise RuntimeError("Ungültige JSON-Antwort der Websuche")
    return data


def _normalize_results(rows: Iterable[dict]) -> list[SearchResult]:
    """Convert raw ``title``/``snippet``/``url`` dicts into ``SearchResult``s.

    Non-dict rows and rows without a URL are skipped; all fields are
    stringified and stripped.
    """
    out: list[SearchResult] = []
    for row in rows:
        if not isinstance(row, dict):
            continue
        url = str(row.get("url") or "").strip()
        if not url:
            continue
        out.append(
            SearchResult(
                title=str(row.get("title") or "").strip(),
                snippet=str(row.get("snippet") or "").strip(),
                url=url,
            )
        )
    return out


def _clamp_count(num_results: int) -> int:
    """Clamp the requested result count to the range 1..10."""
    return max(1, min(int(num_results), _MAX_RESULTS_PER_REQUEST))


def _collect(items: object, title_key: str, url_key: str) -> list[SearchResult]:
    """Map provider-specific item dicts onto normalised search results.

    Args:
        items: The provider's result list; anything that is not a list is
            treated as "no results".
        title_key: Key holding the title in the provider's item dicts.
        url_key: Key holding the target URL in the provider's item dicts.
    """
    if not isinstance(items, list):
        return []
    return _normalize_results(
        {
            "title": it.get(title_key, ""),
            "snippet": it.get("snippet", ""),
            "url": it.get(url_key, ""),
        }
        for it in items
        if isinstance(it, dict)
    )


def _search_google_cse(query: str, num_results: int) -> list[SearchResult]:
    """Run *query* against the Google Custom Search JSON API.

    Reads ``GOOGLE_CSE_API_KEY`` and ``GOOGLE_CSE_CX`` from the environment.

    Raises:
        SearchProviderConfigError: If either environment variable is unset.
    """
    key = os.getenv("GOOGLE_CSE_API_KEY", "").strip()
    cx = os.getenv("GOOGLE_CSE_CX", "").strip()
    if not key or not cx:
        raise SearchProviderConfigError(
            "Google CSE nicht konfiguriert (GOOGLE_CSE_API_KEY/GOOGLE_CSE_CX fehlen)."
        )
    params = urllib.parse.urlencode(
        {
            "key": key,
            "cx": cx,
            "q": query,
            "num": _clamp_count(num_results),
            "safe": "off",
        }
    )
    url = f"https://www.googleapis.com/customsearch/v1?{params}"
    data = _http_get_json(url, timeout=10)
    return _collect(data.get("items"), title_key="title", url_key="link")


def _search_serpapi(query: str, num_results: int) -> list[SearchResult]:
    """Run *query* against SerpAPI using its Google engine.

    Reads ``SERPAPI_API_KEY`` from the environment.

    Raises:
        SearchProviderConfigError: If the API key is unset.
    """
    key = os.getenv("SERPAPI_API_KEY", "").strip()
    if not key:
        raise SearchProviderConfigError(
            "SerpAPI nicht konfiguriert (SERPAPI_API_KEY fehlt)."
        )
    params = urllib.parse.urlencode(
        {
            "api_key": key,
            "engine": "google",
            "q": query,
            "num": _clamp_count(num_results),
        }
    )
    url = f"https://serpapi.com/search.json?{params}"
    data = _http_get_json(url, timeout=12)
    return _collect(data.get("organic_results"), title_key="title", url_key="link")


def _search_bing(query: str, num_results: int) -> list[SearchResult]:
    """Run *query* against the Bing Web Search v7 API.

    Reads ``BING_API_KEY`` from the environment and sends it in the
    ``Ocp-Apim-Subscription-Key`` header. A response without a usable
    ``webPages.value`` list yields an empty result instead of an error.

    Raises:
        SearchProviderConfigError: If the API key is unset.
    """
    key = os.getenv("BING_API_KEY", "").strip()
    if not key:
        raise SearchProviderConfigError(
            "Bing Web Search nicht konfiguriert (BING_API_KEY fehlt)."
        )
    params = urllib.parse.urlencode(
        {
            "q": query,
            "count": _clamp_count(num_results),
            "textDecorations": "false",
            "textFormat": "Raw",
        }
    )
    url = f"https://api.bing.microsoft.com/v7.0/search?{params}"
    data = _fetch_json(
        url,
        timeout=10,
        extra_headers={"Ocp-Apim-Subscription-Key": key},
    )
    # Walk the nested structure defensively: any unexpected shape means
    # "no results" rather than an AttributeError.
    web_pages = data.get("webPages") if isinstance(data, dict) else None
    rows = web_pages.get("value") if isinstance(web_pages, dict) else None
    return _collect(rows, title_key="name", url_key="url")


# Value of WEBSEARCH_PROVIDER -> implementation.
_PROVIDERS = {
    "google_cse": _search_google_cse,
    "serpapi": _search_serpapi,
    "bing": _search_bing,
}


def search_web(query: str, num_results: int = 8) -> list[SearchResult]:
    """Search the web with the provider named by ``WEBSEARCH_PROVIDER``.

    Args:
        query: Search query string.
        num_results: Desired number of hits; each provider clamps it to 1..10.

    Returns:
        Normalised search results from the selected provider.

    Raises:
        SearchProviderConfigError: If ``WEBSEARCH_PROVIDER`` is unset or names
            an unknown provider, or if that provider's credentials are missing.
    """
    provider = os.getenv("WEBSEARCH_PROVIDER", "").strip().lower()
    backend = _PROVIDERS.get(provider)
    if backend is None:
        raise SearchProviderConfigError(
            "Web Search nicht konfiguriert. Setze WEBSEARCH_PROVIDER auf google_cse, serpapi oder bing."
        )
    return backend(query, num_results=num_results)


def _location_hint(regions: Iterable[object] | None) -> str:
    """Translate region codes into a space-separated location keyword string."""
    if isinstance(regions, str):
        # A bare string is one region code, not an iterable of characters.
        regions = [regions]
    wanted = {str(r).strip().upper() for r in (regions or ()) if r is not None}
    tokens: list[str] = []
    for code, words in _REGION_TOKENS:
        if code in wanted:
            tokens.extend(words)
    if not tokens:
        tokens.extend(_DEFAULT_REGION_TOKENS)
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return " ".join(dict.fromkeys(tokens))


def build_queries(
    specialty: str,
    regions: list[str],
    from_date: date,
    to_date: date,
    lang: str = "de",
    max_queries: int = 10,
) -> list[str]:
    """Build search queries for events of a specialty in a period and region.

    Args:
        specialty: Subject area; empty or whitespace-only falls back to
            ``"medical"``.
        regions: Region codes (``CH``, ``EU``, ``WORLD``, ``US``, ``CA``),
            case-insensitive. Unknown codes are ignored; if none match, the
            location defaults to Europe/Switzerland.
        from_date: Start of the period; only its year is used.
        to_date: End of the period; only its year is used.
        lang: Language code; values starting with ``"de"`` add ``Deutsch``
            to the last query template, anything else adds ``English``.
        max_queries: Requested number of queries. The effective limit is
            clamped to the range 6..12, and at most ten templates exist.

    Returns:
        De-duplicated, whitespace-normalised query strings in template order.
    """
    spec = (specialty or "").strip() or "medical"

    # At most the first three years of the range go into the query text.
    years = [str(y) for y in range(from_date.year, to_date.year + 1)]
    year_hint = " ".join(years[:3]) if years else str(from_date.year)

    loc_hint = _location_hint(regions)
    lang_hint = "Deutsch" if str(lang).lower().startswith("de") else "English"

    templates = [
        f"{spec} congress {year_hint} {loc_hint} dates registration",
        f"{spec} conference {year_hint} {loc_hint} official event page",
        f"{spec} course {year_hint} {loc_hint} CME",
        f"{spec} Weiterbildung {year_hint} {loc_hint}",
        f"{spec} Fortbildung {year_hint} {loc_hint}",
        f"{spec} Kongress {year_hint} {loc_hint}",
        f"{spec} congress {year_hint} site:org",
        f"{spec} symposium {year_hint} {loc_hint}",
        f"{spec} annual meeting {year_hint} {loc_hint}",
        f"{spec} event {year_hint} {loc_hint} {lang_hint}",
    ]

    limit = max(6, min(int(max_queries), 12))
    cleaned = (" ".join(q.split()) for q in templates)
    unique = dict.fromkeys(q for q in cleaned if q)
    return list(unique)[:limit]