209 lines
6.7 KiB
Python
209 lines
6.7 KiB
Python
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import urllib.parse
|
||
|
|
import urllib.request
|
||
|
|
from dataclasses import dataclass
|
||
|
|
from datetime import date
|
||
|
|
from typing import Iterable
|
||
|
|
|
||
|
|
|
||
|
|
class SearchProviderConfigError(RuntimeError):
    """Raised when the selected web-search provider is missing its
    configuration (environment variables) or no provider is selected."""
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True)
class SearchResult:
    """One normalized, immutable web-search hit (provider-agnostic)."""

    # Result title as reported by the provider; may be empty after normalization.
    title: str
    # Short description/snippet text; may be empty after normalization.
    snippet: str
    # Target URL; normalization drops rows whose URL is empty.
    url: str
|
||
|
|
|
||
|
|
|
||
|
|
def _http_get_json(url: str, timeout: int = 10, headers: dict | None = None) -> dict:
    """Fetch *url* via HTTP GET and return the parsed JSON object.

    Generalized with an optional ``headers`` parameter (backward compatible)
    so callers that need auth headers (e.g. Bing's subscription key) no
    longer have to duplicate the request/parse logic.

    Args:
        url: Fully encoded URL to request.
        timeout: Socket timeout in seconds.
        headers: Optional extra request headers, merged over the default
            ``User-Agent``.

    Returns:
        The decoded JSON payload as a ``dict``.

    Raises:
        RuntimeError: If the response body is valid JSON but not an object.
        urllib.error.URLError: On network or HTTP errors (from ``urlopen``).
    """
    request_headers = {"User-Agent": "AZA-LiveEventSearch/1.0"}
    if headers:
        request_headers.update(headers)
    req = urllib.request.Request(url, headers=request_headers, method="GET")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        # errors="ignore" drops undecodable bytes instead of raising;
        # the search APIs are expected to return UTF-8 JSON.
        payload = resp.read().decode("utf-8", errors="ignore")
    data = json.loads(payload)
    if not isinstance(data, dict):
        raise RuntimeError("Ungültige JSON-Antwort der Websuche")
    return data
|
||
|
|
|
||
|
|
|
||
|
|
def _normalize_results(rows: Iterable[dict]) -> list[SearchResult]:
    """Convert raw provider rows into ``SearchResult`` objects.

    Non-dict rows and rows without a URL are skipped; title, snippet and
    URL are coerced to stripped strings (``None`` becomes ``""``).
    """
    results: list[SearchResult] = []
    for entry in rows:
        if not isinstance(entry, dict):
            continue
        link = str(entry.get("url") or "").strip()
        if not link:
            # A hit without a target URL is useless downstream.
            continue
        results.append(
            SearchResult(
                title=str(entry.get("title") or "").strip(),
                snippet=str(entry.get("snippet") or "").strip(),
                url=link,
            )
        )
    return results
|
||
|
|
|
||
|
|
|
||
|
|
def _search_google_cse(query: str, num_results: int) -> list[SearchResult]:
    """Query Google Custom Search and return normalized results.

    Reads ``GOOGLE_CSE_API_KEY`` and ``GOOGLE_CSE_CX`` from the environment.

    Raises:
        SearchProviderConfigError: If either environment variable is
            missing or empty.
    """
    api_key = os.getenv("GOOGLE_CSE_API_KEY", "").strip()
    engine_id = os.getenv("GOOGLE_CSE_CX", "").strip()
    if not (api_key and engine_id):
        raise SearchProviderConfigError("Google CSE nicht konfiguriert (GOOGLE_CSE_API_KEY/GOOGLE_CSE_CX fehlen).")
    query_string = urllib.parse.urlencode(
        {
            "key": api_key,
            "cx": engine_id,
            "q": query,
            # The CSE API accepts at most 10 results per request.
            "num": max(1, min(int(num_results), 10)),
            "safe": "off",
        }
    )
    data = _http_get_json(
        f"https://www.googleapis.com/customsearch/v1?{query_string}", timeout=10
    )
    raw_items = data.get("items")
    if not isinstance(raw_items, list):
        raw_items = []
    mapped = [
        {
            "title": item.get("title", ""),
            "snippet": item.get("snippet", ""),
            "url": item.get("link", ""),
        }
        for item in raw_items
        if isinstance(item, dict)
    ]
    return _normalize_results(mapped)
|
||
|
|
|
||
|
|
|
||
|
|
def _search_serpapi(query: str, num_results: int) -> list[SearchResult]:
    """Query SerpAPI (Google engine) and return normalized results.

    Reads ``SERPAPI_API_KEY`` from the environment.

    Raises:
        SearchProviderConfigError: If ``SERPAPI_API_KEY`` is missing/empty.
    """
    api_key = os.getenv("SERPAPI_API_KEY", "").strip()
    if not api_key:
        raise SearchProviderConfigError("SerpAPI nicht konfiguriert (SERPAPI_API_KEY fehlt).")
    query_string = urllib.parse.urlencode(
        {
            "api_key": api_key,
            "engine": "google",
            "q": query,
            # Clamp to the provider's 1..10 per-request window.
            "num": max(1, min(int(num_results), 10)),
        }
    )
    data = _http_get_json(f"https://serpapi.com/search.json?{query_string}", timeout=12)
    organic = data.get("organic_results")
    if not isinstance(organic, list):
        organic = []
    mapped = [
        {
            "title": item.get("title", ""),
            "snippet": item.get("snippet", ""),
            "url": item.get("link", ""),
        }
        for item in organic
        if isinstance(item, dict)
    ]
    return _normalize_results(mapped)
|
||
|
|
|
||
|
|
|
||
|
|
def _search_bing(query: str, num_results: int) -> list[SearchResult]:
    """Query Bing Web Search v7 and return normalized results.

    Reads ``BING_API_KEY`` from the environment; the key is sent via the
    ``Ocp-Apim-Subscription-Key`` header, so the request is built here
    rather than going through the shared JSON helper.

    Raises:
        SearchProviderConfigError: If ``BING_API_KEY`` is missing/empty.
    """
    api_key = os.getenv("BING_API_KEY", "").strip()
    if not api_key:
        raise SearchProviderConfigError("Bing Web Search nicht konfiguriert (BING_API_KEY fehlt).")
    query_string = urllib.parse.urlencode(
        {
            "q": query,
            "count": max(1, min(int(num_results), 10)),
            "textDecorations": "false",
            "textFormat": "Raw",
        }
    )
    request = urllib.request.Request(
        f"https://api.bing.microsoft.com/v7.0/search?{query_string}",
        headers={
            "User-Agent": "AZA-LiveEventSearch/1.0",
            "Ocp-Apim-Subscription-Key": api_key,
        },
        method="GET",
    )
    with urllib.request.urlopen(request, timeout=10) as resp:
        body = resp.read().decode("utf-8", errors="ignore")
    data = json.loads(body)
    # Hits live under webPages.value; fall back to an empty list on any
    # unexpected shape.
    if isinstance(data, dict):
        hits = ((data or {}).get("webPages") or {}).get("value")
    else:
        hits = []
    if not isinstance(hits, list):
        hits = []
    return _normalize_results(
        {
            "title": item.get("name", ""),
            "snippet": item.get("snippet", ""),
            "url": item.get("url", ""),
        }
        for item in hits
        if isinstance(item, dict)
    )
|
||
|
|
|
||
|
|
|
||
|
|
def search_web(query: str, num_results: int = 8) -> list[SearchResult]:
    """Dispatch a web search to the provider named by ``WEBSEARCH_PROVIDER``.

    Supported providers: ``google_cse``, ``serpapi``, ``bing``.

    Raises:
        SearchProviderConfigError: If ``WEBSEARCH_PROVIDER`` is unset or
            unknown (the chosen provider may also raise it when its own
            credentials are missing).
    """
    provider = os.getenv("WEBSEARCH_PROVIDER", "").strip().lower()
    dispatch = {
        "google_cse": _search_google_cse,
        "serpapi": _search_serpapi,
        "bing": _search_bing,
    }
    handler = dispatch.get(provider)
    if handler is None:
        raise SearchProviderConfigError(
            "Web Search nicht konfiguriert. Setze WEBSEARCH_PROVIDER auf google_cse, serpapi oder bing."
        )
    return handler(query, num_results=num_results)
|
||
|
|
|
||
|
|
|
||
|
|
def build_queries(
    specialty: str,
    regions: list[str],
    from_date: date,
    to_date: date,
    lang: str = "de",
    max_queries: int = 10,
) -> list[str]:
    """Build a deduplicated list of web-search queries for live medical events.

    Fixes vs. the previous version: a stray debug token ``python`` was
    appended to the last query template and polluted results; the unused
    ``lang_hint`` local is gone; region entries are coerced with ``str()``
    before ``.strip()`` so non-string entries no longer crash.

    Args:
        specialty: Specialty to search for; empty input falls back to "medical".
        regions: Region codes ("CH", "EU", "WORLD", "US", "CA"); unknown or
            empty input falls back to Europe/Switzerland keywords.
        from_date: Start of the event window (its years feed the year hint).
        to_date: End of the event window.
        lang: Reserved for language-specific query variants; currently unused.
        max_queries: Soft cap on returned queries, clamped to 6..12.

    Returns:
        Up to the clamped number of unique, whitespace-normalized query strings.
    """
    spec = (specialty or "medical").strip()

    # Year hint: at most the first three calendar years of the window.
    years = [str(y) for y in range(from_date.year, to_date.year + 1)]
    year_hint = " ".join(years[:3]) if years else str(from_date.year)

    # Map region codes to locality keywords (multilingual where useful).
    region_tokens: list[str] = []
    region_norm = {str(r).strip().upper() for r in regions if str(r).strip()}
    if "CH" in region_norm:
        region_tokens.extend(["Schweiz", "Suisse", "Switzerland"])
    if "EU" in region_norm:
        region_tokens.extend(["Europa", "Europe"])
    if "WORLD" in region_norm:
        region_tokens.extend(["global", "international"])
    if "US" in region_norm:
        region_tokens.extend(["USA", "United States"])
    if "CA" in region_norm:
        region_tokens.extend(["Canada"])
    if not region_tokens:
        # Default focus when no known region code was supplied.
        region_tokens.extend(["Europe", "Switzerland"])

    # Order-preserving dedupe of locality keywords.
    loc_hint = " ".join(dict.fromkeys(region_tokens))

    base = [
        f"{spec} congress {year_hint} {loc_hint} dates registration",
        f"{spec} conference {year_hint} {loc_hint} official event page",
        f"{spec} course {year_hint} {loc_hint} CME",
        f"{spec} Weiterbildung {year_hint} {loc_hint}",
        f"{spec} Fortbildung {year_hint} {loc_hint}",
        f"{spec} Kongress {year_hint} {loc_hint}",
        f"{spec} congress {year_hint} site:org",
        f"{spec} symposium {year_hint} {loc_hint}",
        f"{spec} annual meeting {year_hint} {loc_hint}",
        f"{spec} event {year_hint} {loc_hint}",
    ]

    cap = max(6, min(int(max_queries), 12))
    out: list[str] = []
    for q in base:
        q_clean = " ".join(str(q).split())
        if q_clean and q_clean not in out:
            out.append(q_clean)
            if len(out) >= cap:
                break
    return out
|
||
|
|
|