Files
aza/AzA march 2026 - Kopie (3)/congress_window.py
2026-03-30 07:59:11 +02:00

1943 lines
76 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
Kongresse & Weiterbildungen Suchfenster
==========================================
SpecialtyService FMH/SIWF Facharzttitel (online + cache + fallback)
MajorCongressRegistry kuratierte Kongresse aus data/major_congresses.json
EventDatabase persistente lokale Event-DB (cache/events_db.json)
SeedCrawler leichtgewichtiger HTML-Crawler für Seed-Quellen
CongressSearchEngine GPT-Websuche, JSON-Extraktion, Merge/Dedup
CongressWindow Tkinter-UI mit Karten-Layout
cache/events_db.json schema:
{
"last_refresh": "YYYY-MM-DDTHH:MM:SS", // ISO UTC
"events": [
{
"title": str,
"start_date": "YYYY-MM-DD" or "",
"end_date": "YYYY-MM-DD" or "",
"location": str,
"format": "onsite|online|hybrid|unknown",
"cme": str,
"type": "congress|symposium|workshop|course|webinar|unknown",
"description": str,
"url": str,
"specialties": [str],
"source": "registry|seed|search"
}, ...
]
}
"""
from __future__ import annotations
import json
import os
import re
import ssl
import threading
import time
import webbrowser
from datetime import date, datetime, timedelta
from html.parser import HTMLParser
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple
from urllib.parse import urlparse, urljoin, parse_qs, urlencode, urlunparse
import tkinter as tk
from tkinter import ttk
from aza_config import get_writable_data_dir
_BUNDLE_DIR = Path(__file__).resolve().parent
_BUNDLE_DATA = _BUNDLE_DIR / "data"
_WRITABLE_DATA = Path(get_writable_data_dir()) / "data"
_CACHE = Path(get_writable_data_dir()) / "cache"
# ═══════════════════════════════════════════════════════════════════════════
# SpecialtyService
# ═══════════════════════════════════════════════════════════════════════════
_SPEC_CACHE = _WRITABLE_DATA / "fmh_specialties_cache.json"
_FALLBACK: List[str] = [
"Allergologie und klinische Immunologie",
"Allgemeine Innere Medizin",
"Anästhesiologie",
"Angiologie",
"Arbeitsmedizin",
"Chirurgie",
"Dermatologie und Venerologie",
"Endokrinologie-Diabetologie",
"Gastroenterologie",
"Gefässchirurgie",
"Gynäkologie und Geburtshilfe",
"Hämatologie",
"Handchirurgie",
"Herz- und thorakale Gefässchirurgie",
"Infektiologie",
"Intensivmedizin",
"Kardiologie",
"Kinder- und Jugendmedizin",
"Kinder- und Jugendpsychiatrie und -psychotherapie",
"Kinderchirurgie",
"Klinische Pharmakologie und Toxikologie",
"Medizinische Genetik",
"Medizinische Onkologie",
"Mund-, Kiefer- und Gesichtschirurgie",
"Nephrologie",
"Neurochirurgie",
"Neurologie",
"Neuropathologie",
"Nuklearmedizin",
"Ophthalmologie",
"Orthopädische Chirurgie und Traumatologie des Bewegungsapparates",
"Oto-Rhino-Laryngologie",
"Pathologie",
"Pharmazeutische Medizin",
"Physikalische Medizin und Rehabilitation",
"Plastische, Rekonstruktive und Ästhetische Chirurgie",
"Pneumologie",
"Prävention und Gesundheitswesen",
"Psychiatrie und Psychotherapie",
"Radiologie",
"Radio-Onkologie / Strahlentherapie",
"Rechtsmedizin",
"Rheumatologie",
"Thoraxchirurgie",
"Tropen- und Reisemedizin",
"Urologie",
"Viszeralchirurgie",
]
class _FMHParser(HTMLParser):
def __init__(self):
super().__init__()
self._in = False
self._d = 0
self._buf = ""
self.titles: List[str] = []
def handle_starttag(self, tag, attrs):
if tag in ("li", "a"):
self._in = True
self._d += 1
def handle_endtag(self, tag):
if tag in ("li", "a") and self._in:
self._d -= 1
if self._d <= 0:
self._in = False
self._d = 0
t = self._buf.strip()
if t and len(t) > 4 and not t.startswith("http"):
self.titles.append(t)
self._buf = ""
def handle_data(self, data):
if self._in:
self._buf += data
def _fetch_fmh() -> Optional[List[str]]:
    """Scrape the FMH/SIWF specialty-title list from the official websites.

    Tries each known URL in order and returns a sorted, de-duplicated list
    of titles, or None when no source yields a plausible result. All
    network/parse errors are swallowed; callers fall back to the bundled
    `_FALLBACK` list.
    """
    try:
        import urllib.request
        ctx = ssl.create_default_context()
        for url in [
            "https://www.siwf.ch/weiterbildung/facharzttitel-und-schwerpunkte.cfm",
            "https://www.fmh.ch/bildung-siwf/fachgebiete/facharzttitel-und-schwerpunkte.html",
        ]:
            try:
                req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
                with urllib.request.urlopen(req, timeout=12, context=ctx) as resp:
                    html = resp.read().decode("utf-8", errors="replace")
                p = _FMHParser()
                p.feed(html)
                # Keep only strings that look like medical specialty names.
                kw = ["medizin", "chirurgie", "logie", "iatrie", "pathie"]
                out = [t for t in p.titles
                       if any(k in t.lower() for k in kw)
                       or "Ophthalmologie" in t or "Anästhes" in t
                       or "Rechtsmedizin" in t]
                # Sanity threshold: a genuine title list has at least 15 entries.
                if len(out) >= 15:
                    return sorted(set(out))
            except Exception:
                continue  # this source failed; try the next URL
    except Exception:
        pass
    return None
class SpecialtyService:
    """Provides the FMH/SIWF specialty titles (disk cache > bundled fallback)."""

    def __init__(self):
        self._titles: List[str] = []
        self._load()

    def _load(self):
        """Populate self._titles from the JSON cache file, else the fallback."""
        try:
            if _SPEC_CACHE.is_file():
                with open(_SPEC_CACHE, "r", encoding="utf-8") as f:
                    cached = json.load(f)
                # Only trust a cache that actually looks like a title list.
                if isinstance(cached, list) and len(cached) >= 10:
                    self._titles = cached
                    return
        except Exception:
            pass
        self._titles = list(_FALLBACK)

    def _save(self, titles):
        """Best-effort write of *titles* to the cache file."""
        try:
            _WRITABLE_DATA.mkdir(parents=True, exist_ok=True)
            with open(_SPEC_CACHE, "w", encoding="utf-8") as f:
                json.dump(titles, f, ensure_ascii=False, indent=2)
        except Exception:
            pass

    def get_titles(self) -> List[str]:
        """Return a copy of the current titles (fallback when empty)."""
        return list(self._titles or _FALLBACK)

    def refresh(self, callback=None):
        """Re-fetch the titles in a daemon thread; callback(ok, count)."""
        def _worker():
            fetched = _fetch_fmh()
            ok = bool(fetched) and len(fetched) >= 15
            if ok:
                self._titles = fetched
                self._save(fetched)
            if callback:
                callback(ok, len(fetched) if ok else 0)
        threading.Thread(target=_worker, daemon=True).start()
_spec_svc = SpecialtyService()
# ═══════════════════════════════════════════════════════════════════════════
# MajorCongressRegistry
# ═══════════════════════════════════════════════════════════════════════════
_CONGRESS_JSON = _BUNDLE_DATA / "major_congresses.json"
class MajorCongressRegistry:
    """Curated major congresses, loaded once from data/major_congresses.json."""

    _cache: Optional[Dict[str, List[Dict[str, str]]]] = None

    @classmethod
    def _load(cls) -> Dict[str, List[Dict[str, str]]]:
        """Lazily read the registry file; {} on missing/corrupt data."""
        if cls._cache is None:
            loaded: Dict[str, List[Dict[str, str]]] = {}
            if _CONGRESS_JSON.is_file():
                try:
                    with open(_CONGRESS_JSON, "r", encoding="utf-8") as f:
                        loaded = json.load(f)
                except Exception:
                    loaded = {}
            cls._cache = loaded
        return cls._cache

    @classmethod
    def get_for(cls, selected: Set[str]) -> List[Dict[str, str]]:
        """Registry entries whose key substring-matches a selected specialty."""
        data = cls._load()
        seen: Set[str] = set()
        result: List[Dict[str, str]] = []
        for spec in sorted(selected):
            spec_l = spec.lower()
            for key, entries in data.items():
                key_l = key.lower()
                if key_l not in spec_l and spec_l not in key_l:
                    continue
                for entry in entries:
                    name_key = entry["name"].lower()
                    if name_key in seen:
                        continue
                    seen.add(name_key)
                    result.append({**entry, "specialty": key})
        return result

    @classmethod
    def names_for(cls, selected: Set[str]) -> List[str]:
        """Just the congress names for *selected*."""
        return [entry["name"] for entry in cls.get_for(selected)]

    @classmethod
    def as_db_events(cls, selected: Set[str]) -> List[Dict[str, Any]]:
        """Registry entries shaped like events_db records (source='registry')."""
        return [
            {
                "title": entry["name"],
                "start_date": "", "end_date": "",
                "location": "", "format": "unknown",
                "cme": "", "type": "congress",
                "description": entry.get("org", ""),
                "url": entry["url"],
                "specialties": [entry.get("specialty", "")],
                "source": "registry",
                "_org": entry.get("org", ""),
            }
            for entry in cls.get_for(selected)
        ]
# ═══════════════════════════════════════════════════════════════════════════
# Utility functions
# ═══════════════════════════════════════════════════════════════════════════
_TRACKING_PARAMS = {"utm_source", "utm_medium", "utm_campaign", "utm_term",
"utm_content", "fbclid", "gclid", "mc_cid", "mc_eid"}
def _canonical_url(raw: str) -> str:
u = raw.strip().rstrip(".,;)>").lstrip("<")
if u.startswith("www."):
u = "https://" + u
try:
p = urlparse(u)
qs = parse_qs(p.query, keep_blank_values=False)
cleaned = {k: v for k, v in qs.items() if k.lower() not in _TRACKING_PARAMS}
new_q = urlencode(cleaned, doseq=True)
path = p.path.rstrip("/") or "/"
return urlunparse((p.scheme.lower(), p.netloc.lower(), path, p.params, new_q, ""))
except Exception:
return u
def _clean_url(raw: str) -> str:
u = raw.strip().rstrip(".,;)>").lstrip("<")
if u.startswith("www."):
u = "https://" + u
return u
def _valid_url(u: str) -> bool:
try:
p = urlparse(u)
return p.scheme in ("http", "https") and bool(p.netloc)
except Exception:
return False
def _is_url_reachable(url: str) -> bool:
    """Best-effort reachability probe for *url*.

    Sends a HEAD request first; some servers reject HEAD (403/405), in
    which case a small GET is retried. Returns False on any other failure.

    Fix: `ctx` and `headers` used to be bound inside the first try block
    but referenced in the except handler — a failure before their binding
    would have raised NameError instead of returning False. They are now
    prepared up front (still guarded, preserving the old "any error ->
    False" contract).
    """
    if not url or not _valid_url(url):
        return False
    try:
        import urllib.request
        ctx = ssl.create_default_context()
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        }
    except Exception:
        return False
    try:
        req = urllib.request.Request(url, method="HEAD", headers=headers)
        with urllib.request.urlopen(req, timeout=7, context=ctx) as resp:
            return resp.status < 400
    except Exception as e:
        # HEAD not allowed/forbidden: retry with a lightweight GET.
        if "405" in str(e) or "403" in str(e):
            try:
                req2 = urllib.request.Request(url, headers=headers)
                with urllib.request.urlopen(req2, timeout=8, context=ctx) as resp2:
                    resp2.read(1024)
                    return resp2.status < 400
            except Exception:
                return False
        return False
def _validate_search_event_url(ev: Dict[str, Any]) -> None:
    """Verify ev["url"] and reconcile the event's date with the linked page.

    Mutates *ev* in place: clears the URL when it is unreachable or not
    event-specific (the UI then falls back to a Google search link),
    backfills location/format from the fetched page, and drops or replaces
    start/end dates that the page does not corroborate. May also set
    ev["_date_confidence"].
    """
    url = ev.get("url", "")
    if not url:
        return
    # More robust link check: resolve the final event URL when possible.
    try:
        from services.link_verify import verify_event_url  # type: ignore
        v = verify_event_url(url, ev.get("title", ""))
        if v.finalUrl:
            ev["url"] = v.finalUrl
            url = v.finalUrl
        if (v.httpStatus and v.httpStatus >= 400) or v.httpStatus == 0:
            ev["url"] = ""
            return
        # Registry homepages are often not event-specific: rather show a
        # Google search than a potentially wrong link.
        if ev.get("source") == "registry" and not bool(v.isDirectEventPage):
            ev["url"] = ""
            return
    except Exception:
        pass  # optional verifier unavailable/failed: use the local checks below
    html = _fetch_page(url, timeout=8)
    if html is None:
        # No stably reachable content: do not show the link. The event stays
        # visible and automatically gets the Google fallback.
        ev["url"] = ""
        return
    extracted = _extract_event_from_html(html, url) or {}
    # Backfill location/format only where the event has no value yet.
    if extracted.get("location") and not ev.get("location"):
        ev["location"] = extracted.get("location", "")
    if extracted.get("format") and ev.get("format", "unknown") == "unknown":
        ev["format"] = extracted.get("format", "unknown")
    extracted_sd = extracted.get("start_date", "") or ""
    extracted_score = int(extracted.get("_date_score", -999))
    extracted_conf = extracted.get("_date_confidence", "low")
    if extracted_conf:
        ev["_date_confidence"] = extracted_conf
    sd = ev.get("start_date", "")
    if not sd:
        # Event came without a date: adopt the page's date when reasonably sure.
        if extracted_sd and extracted_score >= 2:
            ev["start_date"] = extracted_sd
            if extracted.get("end_date"):
                ev["end_date"] = extracted.get("end_date", "")
        return
    if sd:
        page_dates = _DATE_RE.findall(html[:30000])
        found_on_page = False
        ev_date = _parse_date(sd)
        if ev_date:
            # Accept the claimed date if the page shows it (within ±2 days).
            for group in page_dates:
                for raw in group:
                    if not raw:
                        continue
                    pd = _parse_date(raw)
                    if pd and pd == ev_date:
                        found_on_page = True
                        break
                    if pd and abs((pd - ev_date).days) <= 2:
                        found_on_page = True
                        break
                if found_on_page:
                    break
            if not found_on_page:
                # Weaker corroboration: year AND month name both on the page.
                year_str = str(ev_date.year)
                month_found = False
                for mname, mnum in _MONTH_MAP.items():
                    if mnum == ev_date.month and mname in html[:30000].lower():
                        month_found = True
                        break
                if not (year_str in html[:30000] and month_found):
                    # Existing date does not match the page -> only replace it
                    # when a context-strong alternative was found.
                    if extracted_sd and extracted_score >= 3:
                        ev["start_date"] = extracted_sd
                        ev["end_date"] = extracted.get("end_date", "") or ""
                    else:
                        # Do not keep presenting uncertain data as a factual
                        # event date.
                        ev["start_date"] = ""
                        ev["end_date"] = ""
def _norm_title(t: str) -> str:
return re.sub(r"[^a-z0-9]", "", t.lower())
_MONTH_MAP = {
"january": 1, "jan": 1, "januar": 1,
"february": 2, "feb": 2, "februar": 2,
"march": 3, "mar": 3, "märz": 3,
"april": 4, "apr": 4,
"may": 5, "mai": 5,
"june": 6, "jun": 6, "juni": 6,
"july": 7, "jul": 7, "juli": 7,
"august": 8, "aug": 8,
"september": 9, "sep": 9, "sept": 9,
"october": 10, "oct": 10, "oktober": 10,
"november": 11, "nov": 11,
"december": 12, "dec": 12, "dezember": 12,
}
def _parse_date(s: str) -> Optional[date]:
if not s:
return None
s = s.strip()
s = re.sub(r"(?<=\d)[\s]*(?:|-|to|bis)[\s]*\d{1,2}\.?(?=\s+[A-Za-zÄÖÜäöü])", "", s).strip()
for fmt in ("%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%B %d, %Y", "%B %d %Y",
"%d %B %Y", "%b %d, %Y", "%b %d %Y", "%d %b %Y"):
try:
return datetime.strptime(s, fmt).date()
except Exception:
continue
m = re.match(r"(\d{1,2})\.?\s+(\w+)\s+(\d{4})", s)
if m:
mn = _MONTH_MAP.get(m.group(2).lower())
if mn:
try:
return date(int(m.group(3)), mn, int(m.group(1)))
except ValueError:
pass
m = re.match(r"(\w+)\s+(\d{1,2}),?\s+(\d{4})", s)
if m:
mn = _MONTH_MAP.get(m.group(1).lower())
if mn:
try:
return date(int(m.group(3)), mn, int(m.group(2)))
except ValueError:
pass
m = re.match(r"(\w+)\s+(\d{4})", s)
if m:
mn = _MONTH_MAP.get(m.group(1).lower())
if mn:
try:
return date(int(m.group(2)), mn, 1)
except ValueError:
pass
return None
def _month_label(d: date) -> str:
months = ["", "Januar", "Februar", "März", "April", "Mai", "Juni",
"Juli", "August", "September", "Oktober", "November", "Dezember"]
return f"{months[d.month]} {d.year}"
# ═══════════════════════════════════════════════════════════════════════════
# EventDatabase
# ═══════════════════════════════════════════════════════════════════════════
# Persistent event cache; its JSON schema is documented in the module docstring.
_DB_FILE = _CACHE / "events_db.json"
class EventDatabase:
    """Thread-safe, JSON-file-backed store of congress events."""
    def __init__(self):
        self._events: List[Dict[str, Any]] = []
        # ISO "YYYY-MM-DDTHH:MM:SS" in UTC, or None before the first save.
        self._last_refresh: Optional[str] = None
        self._lock = threading.Lock()
        self.load()
    def load(self) -> List[Dict[str, Any]]:
        """(Re)load events from disk and return a copy of the event list."""
        with self._lock:
            if _DB_FILE.is_file():
                try:
                    with open(_DB_FILE, "r", encoding="utf-8") as f:
                        data = json.load(f)
                    self._events = data.get("events", [])
                    self._last_refresh = data.get("last_refresh")
                except Exception:
                    # Corrupt cache: start over with an empty DB.
                    self._events = []
                    self._last_refresh = None
            return list(self._events)
    def save(self, events: Optional[List[Dict[str, Any]]] = None):
        """Persist current (or given) events; stamps last_refresh with UTC now."""
        with self._lock:
            if events is not None:
                self._events = events
            # NOTE(review): utcnow() is naive UTC and deprecated since 3.12;
            # datetime.now(timezone.utc) would be the modern spelling.
            self._last_refresh = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")
            try:
                _CACHE.mkdir(parents=True, exist_ok=True)
                payload = {
                    "last_refresh": self._last_refresh,
                    "events": self._events,
                }
                with open(_DB_FILE, "w", encoding="utf-8") as f:
                    json.dump(payload, f, ensure_ascii=False, indent=1)
            except Exception:
                # Best-effort persistence: failures keep the in-memory DB intact.
                pass
    def merge_in(self, new_events: List[Dict[str, Any]]):
        """Merge *new_events* into the DB, deduplicating by URL or title|date.

        Lower source priority wins (registry < seed < search). On ties a
        dated event replaces an undated one; otherwise missing fields of
        the kept event are filled in from the newcomer.
        """
        with self._lock:
            seen_keys: Set[str] = set()  # NOTE(review): never written — appears unused
            source_priority = {"registry": 0, "seed": 1, "search": 2}
            combined = list(self._events) + new_events
            by_key: Dict[str, Dict[str, Any]] = {}
            for ev in combined:
                url = ev.get("url", "")
                canon = _canonical_url(url) if url else ""
                nt = _norm_title(ev.get("title", ""))
                sd = ev.get("start_date", "")
                # Dedup key: canonical URL when present, else title|start_date.
                key = canon if canon else f"{nt}|{sd}"
                if not key:
                    continue
                existing = by_key.get(key)
                if existing is None:
                    by_key[key] = ev
                else:
                    ep = source_priority.get(existing.get("source", "search"), 2)
                    np = source_priority.get(ev.get("source", "search"), 2)
                    if np < ep:
                        by_key[key] = ev
                    elif np == ep:
                        if ev.get("start_date") and not existing.get("start_date"):
                            by_key[key] = ev
                        elif ev.get("location") and not existing.get("location"):
                            # Fill gaps field-by-field from the newcomer.
                            for field in ("location", "start_date", "end_date",
                                          "cme", "format", "description"):
                                if ev.get(field) and not existing.get(field):
                                    existing[field] = ev[field]
            self._events = list(by_key.values())
    def query(self, specialties: Set[str], months: int,
              regions: List[str]) -> List[Dict[str, Any]]:
        """Events matching *specialties* within the next *months* months.

        *regions* is accepted but not used for filtering here. Matching
        results gain transient _parsed_start/_parsed_end fields and are
        sorted by start date, undated events last.
        """
        today = date.today()
        end = today + timedelta(days=months * 30)
        out = []
        for ev in self._events:
            ev_specs = set(ev.get("specialties", []))
            if specialties and not ev_specs.intersection(specialties):
                # No exact overlap: accept substring matches in either
                # direction ("Kardiologie" vs "Kardiologie und Angiologie").
                overlap = False
                for s in specialties:
                    for es in ev_specs:
                        if s.lower() in es.lower() or es.lower() in s.lower():
                            overlap = True
                            break
                    if overlap:
                        break
                if not overlap and ev_specs:
                    continue
            sd = _parse_date(ev.get("start_date", ""))
            if sd and sd < today:
                continue
            if sd and sd > end:
                continue
            ev["_parsed_start"] = sd
            ev["_parsed_end"] = _parse_date(ev.get("end_date", ""))
            out.append(ev)
        def _sk(e):
            # Dated events first (ascending); undated ones sort to the end.
            d = e.get("_parsed_start")
            return (0, d) if d else (1, date.max)
        out.sort(key=_sk)
        return out
    def needs_refresh(self, max_age_days: int = 7) -> bool:
        """True when the cache is older than *max_age_days* or was never saved."""
        if not self._last_refresh:
            return True
        try:
            lr = datetime.strptime(self._last_refresh, "%Y-%m-%dT%H:%M:%S")
            return (datetime.utcnow() - lr).days >= max_age_days
        except Exception:
            return True
    def last_refresh_time(self) -> str:
        """Human-readable label of the last refresh ('nie' = never)."""
        if not self._last_refresh:
            return "nie"
        try:
            lr = datetime.strptime(self._last_refresh, "%Y-%m-%dT%H:%M:%S")
            return lr.strftime("%d.%m.%Y %H:%M")
        except Exception:
            return self._last_refresh or "nie"
    def count(self) -> int:
        """Number of events currently held in memory."""
        return len(self._events)
_event_db = EventDatabase()
# ═══════════════════════════════════════════════════════════════════════════
# SeedCrawler
# ═══════════════════════════════════════════════════════════════════════════
_SEED_JSON = _BUNDLE_DATA / "seed_sources.json"
_EVENT_KEYWORDS = re.compile(
r"event|congress|meeting|conference|course|cme|"
r"fortbildung|kongress|symposium|workshop|webinar|"
r"annual|session|hands.on|masterclass",
re.IGNORECASE,
)
_MONTH_NAMES = (
r"January|February|March|April|May|June|July|August|September|October|November|December"
r"|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec"
r"|Januar|Februar|März|Mai|Juni|Juli|August|Oktober|Dezember"
)
_DATE_RE = re.compile(
r"(\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4})"
r"|(\d{4}[./\-]\d{1,2}[./\-]\d{1,2})"
rf"|((?:{_MONTH_NAMES})\s+\d{{1,2}},?\s+\d{{4}})"
rf"|(\d{{1,2}}[\.\s]*(?:|-|to|bis)\s*\d{{1,2}}\.?\s+(?:{_MONTH_NAMES})\s+\d{{4}})"
rf"|(\d{{1,2}}\.?\s+(?:{_MONTH_NAMES})\s+\d{{4}})"
rf"|((?:{_MONTH_NAMES})\s+\d{{4}})",
re.IGNORECASE,
)
# Explicit CME/credit mentions, e.g. "CME: 12", "EACCME 18", "CPD 8".
_CME_RE = re.compile(r"(CME|EACCME|ECMEC|CPD|AMA\s*PRA|credit)[:\s]*(\d+)?", re.IGNORECASE)
# Event-format hints found in page text.
_ONLINE_RE = re.compile(r"\b(online|virtual|webinar|digital)\b", re.IGNORECASE)
_HYBRID_RE = re.compile(r"\b(hybrid)\b", re.IGNORECASE)
# "Venue:"/"Ort:"-style labels followed by a short free-text location.
_LOC_RE = re.compile(
    r"(?:Location|Venue|Ort|Where|Place|Lieu|City|Tagungsort|Veranstaltungsort|"
    r"Kongressort|Convention\s*Center|Conference\s*Venue|Austragungsort)"
    r"[:\s]+([^\n<]{5,120})",
    re.IGNORECASE,
)
# Fallback: well-known congress cities scanned directly in the page text.
_CITY_RE = re.compile(
    r"\b(Zürich|Zurich|Bern|Basel|Genf|Geneva|Genève|Lausanne|Luzern|Lugano|St\.\s*Gallen|"
    r"Wien|Vienna|Berlin|München|Munich|Hamburg|Frankfurt|Köln|Düsseldorf|Stuttgart|"
    r"Paris|Lyon|Marseille|London|Manchester|Birmingham|Edinburgh|"
    r"Amsterdam|Rotterdam|Den\s*Haag|Brüssel|Brussels|Bruxelles|"
    r"Rom|Rome|Roma|Mailand|Milan|Milano|Florenz|Florence|Bologna|"
    r"Madrid|Barcelona|Sevilla|Lissabon|Lisbon|Lisboa|"
    r"Prag|Prague|Praha|Budapest|Warschau|Warsaw|Warszawa|Krakau|Krakow|"
    r"Kopenhagen|Copenhagen|Stockholm|Oslo|Helsinki|"
    r"New\s*York|Chicago|Boston|San\s*Francisco|Los\s*Angeles|Philadelphia|"
    r"Washington|Houston|Dallas|Atlanta|Miami|Seattle|Denver|"
    r"Toronto|Montréal|Montreal|Vancouver|"
    r"Dubai|Abu\s*Dhabi|Singapur|Singapore|Tokio|Tokyo|Seoul|Sydney|Melbourne|"
    r"Kapstadt|Cape\s*Town|Johannesburg|São\s*Paulo|Buenos\s*Aires|"
    r"Athen|Athens|Istanbul|Bukarest|Bucharest|Dublin|Reykjavik|"
    r"Innsbruck|Graz|Salzburg|Davos|Interlaken|Montreux)\b",
    re.IGNORECASE,
)
# Context words that raise the plausibility of a date candidate nearby ...
_DATE_POSITIVE_CTX = (
    "congress", "kongress", "meeting", "conference", "symposium", "workshop",
    "course", "cme", "fortbildung", "weiterbildung", "annual", "jahreskongress",
    "event", "veranstaltung",
)
# ... and words that lower it (deadlines, footers, booking info, etc.).
_DATE_NEGATIVE_CTX = (
    "deadline", "abstract", "submission", "registr", "early bird", "newsletter",
    "published", "posted", "last updated", "copyright", "sponsor", "exhibit",
    "accommodation", "hotel", "application", "anmeldung bis", "einsendeschluss",
)
def _html_to_text(snippet: str) -> str:
txt = re.sub(r"<[^>]+>", " ", snippet or "")
txt = re.sub(r"\s+", " ", txt).strip()
return txt
def _title_tokens_for_date_match(title: str) -> List[str]:
toks = [t for t in re.split(r"[^a-z0-9äöüß]+", (title or "").lower()) if len(t) >= 5]
# Häufige Funktionswörter raus, damit echte Event-Begriffe stärker zählen.
stop = {"congress", "kongress", "annual", "meeting", "conference", "society"}
return [t for t in toks if t not in stop][:8]
def _extract_title_year(title: str) -> Optional[int]:
m = re.search(r"\b(20\d{2})\b", title or "")
if not m:
return None
try:
return int(m.group(1))
except Exception:
return None
def _score_date_candidate(raw_html: str, match_start: int, match_end: int,
                          dt: date, title: str) -> int:
    """Heuristic plausibility score for a date candidate found in HTML.

    Looks at a ~260-char window around the match: prominent markup and
    event vocabulary raise the score, deadline/footer vocabulary lowers
    it; agreement with the title's tokens/year and a near-future date add
    further weight, while past dates are penalised.
    """
    lo = max(0, match_start - 260)
    hi = min(len(raw_html), match_end + 260)
    window = raw_html[lo:hi]
    window_l = window.lower()
    ctx_text = _html_to_text(window).lower()
    score = 0
    # Approximate "prominent" placement: heading/time/strong markup nearby.
    if any(tag in window_l for tag in ("<h1", "<h2", "<time", "<strong")):
        score += 3
    if any(word in ctx_text for word in _DATE_POSITIVE_CTX):
        score += 3
    if any(word in ctx_text for word in _DATE_NEGATIVE_CTX):
        score -= 5
    tokens = _title_tokens_for_date_match(title)
    if tokens and any(tok in ctx_text for tok in tokens):
        score += 3
    title_year = _extract_title_year(title)
    if title_year is not None:
        score += 2 if dt.year == title_year else -4
    today = date.today()
    if dt < today:
        score -= 6
    elif (dt - today).days <= 400:
        # Slight preference for near-future events.
        score += 1
    return score
def _pick_best_date_from_html(raw_html: str, title: str) -> Tuple[Optional[date], int]:
    """Scan the first 120k chars of *raw_html* for the most plausible date.

    Every _DATE_RE hit is parsed and scored via _score_date_candidate;
    score ties prefer the earlier date (likely the event's start day).
    Returns (best_date, best_score), or (None, -999) when nothing usable
    was found.
    """
    best_date: Optional[date] = None
    best_score = -999
    for m in _DATE_RE.finditer(raw_html[:120000]):
        # _DATE_RE is an alternation; exactly one group carries the text.
        raw = ""
        for g in m.groups():
            if g:
                raw = g
                break
        if not raw:
            continue
        pd = _parse_date(raw)
        if not pd:
            continue
        # Ignore implausible years (stale pages, copyright footers, typos).
        if pd.year < 2020 or pd.year > (date.today().year + 5):
            continue
        sc = _score_date_candidate(raw_html, m.start(), m.end(), pd, title)
        if sc > best_score or (sc == best_score and best_date is not None and pd < best_date):
            best_score = sc
            best_date = pd
    return best_date, best_score
class _LinkExtractor(HTMLParser):
def __init__(self, base_url: str):
super().__init__()
self._base = base_url
self.links: List[str] = []
self._in_title = False
self._in_h = False
self._title = ""
self._h_text = ""
self.page_title = ""
self.headings: List[str] = []
def handle_starttag(self, tag, attrs):
if tag == "a":
for k, v in attrs:
if k == "href" and v:
full = urljoin(self._base, v)
self.links.append(full)
if tag == "title":
self._in_title = True
self._title = ""
if tag in ("h1", "h2"):
self._in_h = True
self._h_text = ""
def handle_endtag(self, tag):
if tag == "title" and self._in_title:
self._in_title = False
self.page_title = self._title.strip()
if tag in ("h1", "h2") and self._in_h:
self._in_h = False
t = self._h_text.strip()
if t and len(t) > 3:
self.headings.append(t)
def handle_data(self, data):
if self._in_title:
self._title += data
if self._in_h:
self._h_text += data
def _fetch_page(url: str, timeout: int = 11) -> Optional[str]:
    """Fetch *url* and return up to 500 kB of decoded HTML/text.

    Returns None for non-HTML/text content types and on any network error,
    so callers can treat "no page" and "unusable page" uniformly.
    """
    try:
        import urllib.request
        ctx = ssl.create_default_context()
        req = urllib.request.Request(url, headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Accept": "text/html,application/xhtml+xml",
        })
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            ct = resp.headers.get("Content-Type", "")
            # Skip binary payloads (PDF, images, ...): only html/text is parsed.
            if "html" not in ct.lower() and "text" not in ct.lower():
                return None
            return resp.read(500_000).decode("utf-8", errors="replace")
    except Exception:
        return None
_CSS_JS_RE = re.compile(r"<\s*(?:style|script)[^>]*>.*?</\s*(?:style|script)\s*>", re.DOTALL | re.IGNORECASE)
_CSS_FRAG_RE = re.compile(r"[{};]\s*\w[\w-]*\s*:\s*\w")
def _is_css_junk(text: str) -> bool:
return bool(_CSS_FRAG_RE.search(text)) or text.strip().startswith(".")
def _extract_event_from_html(html: str, url: str) -> Optional[Dict[str, Any]]:
    """Heuristically extract one event record from an HTML page.

    Derives the title from the first h1/h2 (falling back to <title>), the
    best-scored date, a CME mention with its surrounding text as evidence,
    the format (hybrid/online/onsite) and a location. Returns None when no
    usable title is found. The result carries transient _date_score and
    _date_confidence fields used by the URL validator.
    """
    html = _CSS_JS_RE.sub(" ", html)
    parser = _LinkExtractor(url)
    try:
        parser.feed(html)
    except Exception:
        pass  # keep whatever the parser collected before failing
    title = parser.page_title
    if parser.headings:
        title = parser.headings[0]
    if not title or len(title) < 4:
        return None
    # Headings sometimes contain leaked CSS; retry with the <title> text.
    if _is_css_junk(title):
        title = parser.page_title or ""
    if not title or len(title) < 4 or _is_css_junk(title):
        return None
    title = re.sub(r"\s*[\|–—-]\s*$", "", title).strip()
    title = re.sub(r"\s+", " ", title)
    if len(title) > 150:
        # NOTE(review): the appended "" looks like a lost "…" ellipsis
        # (file was flagged for stripped Unicode) — confirm and restore.
        title = title[:147] + ""
    best_date, best_score = _pick_best_date_from_html(html, title)
    # Only trust the date when its context score is at least "medium".
    start_date = best_date.isoformat() if best_date and best_score >= 2 else ""
    cme = ""
    cme_evidence = ""
    cme_match = _CME_RE.search(html[:20000])
    if cme_match:
        # Capture ±60 chars of de-tagged context as human-checkable evidence.
        start = max(0, cme_match.start() - 60)
        end_ctx = min(len(html), cme_match.end() + 60)
        context = re.sub(r"<[^>]+>", " ", html[start:end_ctx]).strip()
        context = re.sub(r"\s+", " ", context)[:120]
        cme = cme_match.group(0).strip()
        cme_evidence = context
    fmt = "unknown"
    if _HYBRID_RE.search(html[:10000]):
        fmt = "hybrid"
    elif _ONLINE_RE.search(html[:10000]):
        fmt = "online"
    elif start_date:
        # A dated event without online/hybrid hints is assumed on-site.
        fmt = "onsite"
    loc = ""
    loc_m = _LOC_RE.search(html[:30000])
    if loc_m:
        loc = loc_m.group(1).strip()
        loc = re.sub(r"<[^>]+>", "", loc).strip()
        loc = re.sub(r"\s+", " ", loc)
        if len(loc) > 80:
            # NOTE(review): likely another lost "…" — confirm.
            loc = loc[:77] + ""
    if not loc:
        # Fallback: scan plain text for a well-known congress city.
        text_chunk = re.sub(r"<[^>]+>", " ", html[:20000])
        city_m = _CITY_RE.search(text_chunk)
        if city_m:
            loc = city_m.group(1).strip()
    return {
        "title": title,
        "start_date": start_date,
        "end_date": "",
        "location": loc,
        "format": fmt,
        "cme": cme,
        "cme_evidence": cme_evidence,
        "type": "unknown",
        "description": "",
        "url": url,
        "_date_score": best_score,
        "_date_confidence": "high" if best_score >= 6 else ("medium" if best_score >= 2 else "low"),
    }
class SeedCrawler:
    """Lightweight HTML crawler over curated seed pages (seed_sources.json)."""
    @staticmethod
    def _load_seeds() -> Dict[str, List[Dict[str, str]]]:
        """Map of specialty key -> seed source entries; {} if missing/corrupt."""
        if _SEED_JSON.is_file():
            try:
                with open(_SEED_JSON, "r", encoding="utf-8") as f:
                    return json.load(f)
            except Exception:
                pass
        return {}
    @staticmethod
    def crawl_for_specialties(specialties: Set[str],
                              progress_cb=None) -> List[Dict[str, Any]]:
        """Crawl seed pages matching *specialties* and extract event records.

        progress_cb(step, total, seed_name), when given, is invoked once per
        seed. At most 10 event-looking links are followed per seed page and
        duplicate (canonicalized) URLs are fetched only once. Returns events
        tagged with source='seed'.
        """
        seeds = SeedCrawler._load_seeds()
        relevant_seeds: List[Tuple[str, Dict[str, str]]] = []
        for spec in specialties:
            for key, entries in seeds.items():
                # Substring match in either direction links seed keys to titles.
                if key.lower() in spec.lower() or spec.lower() in key.lower():
                    for e in entries:
                        relevant_seeds.append((key, e))
        events: List[Dict[str, Any]] = []
        seen_urls: Set[str] = set()
        total = len(relevant_seeds)
        for idx, (specialty, seed) in enumerate(relevant_seeds):
            if progress_cb:
                progress_cb(idx + 1, total, seed.get("name", ""))
            seed_url = seed.get("url", "")
            if not seed_url or not _valid_url(seed_url):
                continue
            html = _fetch_page(seed_url)
            if not html:
                continue
            parser = _LinkExtractor(seed_url)
            try:
                parser.feed(html)
            except Exception:
                continue  # broken markup: skip this seed entirely
            # Keep only links whose URL itself suggests an event page.
            event_links = []
            for link in parser.links:
                if _canonical_url(link) in seen_urls:
                    continue
                if _EVENT_KEYWORDS.search(link):
                    event_links.append(link)
            for link in event_links[:10]:
                canon = _canonical_url(link)
                if canon in seen_urls:
                    continue
                seen_urls.add(canon)
                page_html = _fetch_page(link)
                if not page_html:
                    continue
                ev = _extract_event_from_html(page_html, link)
                if ev and ev.get("title"):
                    ev["specialties"] = [specialty]
                    ev["source"] = "seed"
                    events.append(ev)
        return events
# ═══════════════════════════════════════════════════════════════════════════
# CongressSearchEngine
# ═══════════════════════════════════════════════════════════════════════════
def _validate_cme_batch(events: List[Dict[str, Any]]) -> None:
freq: Dict[str, int] = {}
for ev in events:
c = str(ev.get("cme", "")).strip()
ev["cme"] = c
if c:
num = re.sub(r"\D", "", c)
key = num if num else c
freq[key] = freq.get(key, 0) + 1
threshold = max(2, int(len(events) * 0.15))
suspicious = {v for v, cnt in freq.items() if cnt >= threshold}
for ev in events:
c = ev.get("cme", "")
evidence = ev.get("cme_evidence", "")
if not c:
continue
num = re.sub(r"\D", "", c)
key = num if num else c
if key in suspicious:
ev["cme"] = ""
ev["cme_evidence"] = ""
continue
if not evidence:
ev["cme"] = ""
continue
class CongressSearchEngine:
    """Builds GPT web-search prompts and parses/merges the JSON results."""
    @staticmethod
    def build_prompt(specs: List[str], months: int, regions: List[str],
                     major_names: List[str]) -> Tuple[str, str]:
        """Compose the (system, user) prompt pair for the GPT web search.

        specs: selected specialty titles; months: search horizon from
        today; regions: region codes (e.g. "CH", "EU"); major_names:
        curated congress names the model must cover explicitly.
        """
        spec_text = ", ".join(specs)
        today_s = date.today().isoformat()
        end_s = (date.today() + timedelta(days=months * 30)).isoformat()
        region_text = ", ".join(regions) if regions else "CH, EU"
        neighbor_hint = ""
        # Swiss users get explicit neighbour-country/city coverage.
        if "CH" in regions:
            neighbor_hint = (
                "WICHTIG Der Benutzer ist in der Schweiz. Du MUSST gezielt auch in diesen "
                "Nachbarländern und Städten nach Kongressen suchen:\n"
                " - DEUTSCHLAND: Berlin, München, Hamburg, Frankfurt, Düsseldorf, Köln, Stuttgart, Leipzig, Dresden\n"
                " - ÖSTERREICH: Wien, Salzburg, Innsbruck, Graz\n"
                " - FRANKREICH: Paris, Lyon, Strasbourg, Marseille, Nizza\n"
                " - ITALIEN: Mailand/Milano, Rom/Roma, Bologna, Florenz, Turin\n"
                " - SCHWEIZ: Zürich, Bern, Basel, Genf, Lausanne, Luzern, St. Gallen, Davos\n"
                "Suche aktiv nach Kongressen in diesen Städten! Mindestens 5 Events aus Nachbarländern.\n"
            )
        major_block = "\n".join(f" - {n}" for n in major_names) if major_names else "(keine)"
        system = (
            "Du bist ein weltweiter medizinischer Kongresskalender-Assistent. "
            "Du suchst im Internet und gibst Ergebnisse als JSON-Zeilen zurück. "
            "JEDE Zeile ist ein eigenständiges JSON-Objekt. KEIN Array-Wrapper. "
            "KEIN Markdown. KEIN erklärender Text. NUR JSON-Zeilen."
        )
        user = (
            f"Suche im Internet nach medizinischen Kongressen, Symposien, Workshops, "
            f"Masterclasses, CME-Kursen, Fortbildungen, Weiterbildungen, Hands-on-Trainings "
            f"und Webinaren WELTWEIT für: {spec_text}.\n"
            f"Zeitraum: {today_s} bis {end_s}.\n"
            f"Regionen: {region_text}.\n"
            f"{neighbor_hint}\n"
            f"PFLICHT suche zwingend Datum, Ort und URL für:\n{major_block}\n\n"
            f"Ergänze weitere Events: Workshops, CME-Kurse, Masterclasses, Hands-on, "
            f"Fortbildungen, Webinare.\n"
            f"Suche auch auf DEUTSCH: \"Kongress {spec_text}\", \"Fortbildung {specs[0] if specs else ''}\", "
            f"\"Jahrestagung {specs[0] if specs else ''}\", \"Symposium {specs[0] if specs else ''}\".\n"
            f"Suche auch auf FRANZÖSISCH: \"congrès\", \"formation continue\".\n"
            f"Suche auch auf ITALIENISCH: \"congresso\", \"formazione\".\n\n"
            f"Antworte AUSSCHLIESSLICH mit JSON-Zeilen. Eine Zeile pro Event:\n"
            f'{{"title":"...","start_date":"YYYY-MM-DD","end_date":"YYYY-MM-DD",'
            f'"location":"...","format":"onsite|online|hybrid|unknown",'
            f'"cme":"...","cme_evidence":"...",'
            f'"type":"congress|symposium|workshop|course|webinar|unknown",'
            f'"description":"...","url":"https://...","specialties":["{specs[0] if specs else ""}"]}}\n\n'
            f"Regeln:\n"
            f"- URL: EXTREM WICHTIG jede URL muss die ECHTE offizielle Event-Seite sein.\n"
            f" Öffne die URL gedanklich und prüfe, ob sie zum genannten Event passt.\n"
            f" KEINE erfundenen URLs! KEINE generischen Homepages wenn es eine Event-Unterseite gibt.\n"
            f" Wenn du dir bei einer URL nicht sicher bist: url=\"\" (lieber leer als falsch).\n"
            f"- DATUM + URL müssen zusammenpassen: Das Datum in start_date/end_date MUSS\n"
            f" das Datum sein, das auf der verlinkten Webseite steht.\n"
            f" NIEMALS ein Datum erfinden oder raten!\n"
            f"- URLs immer mit https://.\n"
            f"- CME/Credits: EXTREM WICHTIG folge diesen Regeln strikt:\n"
            f" 1) Setze cme nur dann, wenn du auf der offiziellen Event-Webseite eine EXPLIZITE Angabe findest\n"
            f" wie \"CME: 12 Credits\", \"EACCME: 18\", \"AMA PRA Category 1: 25\", \"CPD: 8 points\".\n"
            f" 2) Kopiere den exakten Originaltext in cme_evidence (z.B. \"EACCME®: 18 European CME credits\").\n"
            f" 3) Wenn die Event-Seite KEINE explizite CME/Credit-Angabe enthält: cme=\"\" und cme_evidence=\"\".\n"
            f" 4) NIEMALS eine CME-Zahl schätzen, raten oder erfinden.\n"
            f" 5) Im Zweifelsfall: cme=\"\" und cme_evidence=\"\". Lieber weglassen als falsch.\n"
            f"- Wenn Format unbekannt: \"unknown\".\n"
            f"- DATUM: EXTREM WICHTIG folge diesen Regeln strikt:\n"
            f" 1) Das Datum MUSS direkt von der offiziellen Event-Webseite stammen.\n"
            f" 2) Öffne die Event-URL und lies das Datum von dort ab.\n"
            f" 3) NIEMALS ein Datum schätzen, raten oder aus dem Gedächtnis nehmen!\n"
            f" 4) Wenn auf der Webseite z.B. \"August 21-23, 2026\" steht, dann start_date=\"2026-08-21\".\n"
            f" 5) Wenn du das Datum auf der Webseite NICHT findest: start_date=\"\" und end_date=\"\".\n"
            f" 6) Lieber kein Datum als ein falsches Datum!\n"
            f"- ORT: Suche IMMER gezielt nach dem Veranstaltungsort (Stadt, Land).\n"
            f" Gib location als \"Stadt, Land\" an (z.B. \"Wien, Österreich\", \"Barcelona, Spain\").\n"
            f" Bei Online-Events: \"Online\". Nur wenn unklar: leerer String.\n"
            f"- Mindestens 30 Events finden, davon mindestens 5 in CH/DE/AT/FR/IT.\n"
            f"- specialties: Array mit passenden Fachrichtungen aus: {spec_text}\n"
            f"- KEIN Markdown, KEIN Text ausserhalb der JSON-Zeilen."
        )
        return system, user
@staticmethod
def parse_response(raw: str, fallback_specs: List[str] = None) -> List[Dict[str, Any]]:
events: List[Dict[str, Any]] = []
for line in raw.split("\n"):
line = line.strip()
if not line or not line.startswith("{"):
continue
try:
obj = json.loads(line)
if isinstance(obj, dict) and "title" in obj:
url = _clean_url(obj.get("url", ""))
if not _valid_url(url):
continue
obj["url"] = url
obj["start_date"] = obj.get("start_date", "") or ""
obj["end_date"] = obj.get("end_date", "") or ""
obj.setdefault("source", "search")
obj.setdefault("cme_evidence", "")
if not obj.get("specialties") and fallback_specs:
obj["specialties"] = fallback_specs
obj["_parsed_start"] = _parse_date(obj["start_date"])
obj["_parsed_end"] = _parse_date(obj["end_date"])
events.append(obj)
except (json.JSONDecodeError, ValueError):
continue
return events
    @staticmethod
    def merge_all(registry_events: List[Dict[str, Any]],
                  db_events: List[Dict[str, Any]],
                  search_events: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Merge and de-duplicate events from all three sources.

        Dedup key is the canonical URL when available, otherwise
        ``"<normalised title>|<start_date>"``. On a collision, an event
        from a higher-priority source (registry beats seed beats search;
        lower number wins) replaces the stored one; an equal/lower-priority
        event only fills in fields the stored event is missing. Events
        whose parsed start date is in the past are dropped; the result is
        sorted by start date with undated events last.
        """
        source_priority = {"registry": 0, "seed": 1, "search": 2}
        by_key: Dict[str, Dict[str, Any]] = {}
        for ev_list in [registry_events, db_events, search_events]:
            for ev in ev_list:
                url = ev.get("url", "")
                canon = _canonical_url(url) if url else ""
                nt = _norm_title(ev.get("title", ""))
                sd = ev.get("start_date", "")
                key = canon if canon else f"{nt}|{sd}"
                if not key:
                    continue
                existing = by_key.get(key)
                if existing is None:
                    by_key[key] = dict(ev)
                else:
                    ep = source_priority.get(existing.get("source", "search"), 2)
                    np = source_priority.get(ev.get("source", "search"), 2)
                    if np < ep:
                        # Higher-priority source replaces the stored event
                        # outright (fields only the old event had are lost).
                        by_key[key] = dict(ev)
                    else:
                        # Equal/lower priority: only fill missing fields.
                        for field in ("start_date", "end_date", "location",
                                      "cme", "format", "description", "type"):
                            if ev.get(field) and not existing.get(field):
                                existing[field] = ev[field]
        merged = list(by_key.values())
        for ev in merged:
            ev["_parsed_start"] = _parse_date(ev.get("start_date", ""))
            ev["_parsed_end"] = _parse_date(ev.get("end_date", ""))
        today = date.today()
        # Keep undated events; drop only those known to be in the past.
        merged = [ev for ev in merged
                  if not ev.get("_parsed_start") or ev["_parsed_start"] >= today]
        def _sk(e):
            # Sort key: dated events ascending first, undated at the end.
            d = e.get("_parsed_start")
            return (0, d) if d else (1, date.max)
        merged.sort(key=_sk)
        return merged
# ═══════════════════════════════════════════════════════════════════════════
# CongressWindow
# ═══════════════════════════════════════════════════════════════════════════
class CongressWindow:
    """Tkinter window listing congresses & CME events for chosen specialties.

    Combines three sources (curated registry, local event DB, GPT web
    search) and renders them as cards in two stacked text panes.
    """
    # Colour palette for the window chrome and text tags.
    _BG = "#f7fafc"        # window background
    _HDR_BG = "#e3ecf4"    # header / status bar background
    _HDR_FG = "#1e4060"    # header text colour
    _CARD_BG = "#ffffff"   # card / text pane background
    _TEXT_FG = "#23404f"   # default text colour
    _LINK_FG = "#186aa5"   # hyperlink colour
    _SEP_FG = "#dce6f0"    # separator line colour
    # German display labels for the event "type" field ("" = no badge).
    _TYPE_LABELS = {
        "congress": "Kongress", "symposium": "Symposium",
        "workshop": "Workshop", "course": "Kurs / CME",
        "webinar": "Webinar", "unknown": "",
    }
    # German display labels for the "format" field ("" = not shown).
    _FORMAT_LABELS = {
        "onsite": "", "online": "Online",
        "hybrid": "Hybrid", "unknown": "",
    }
    def __init__(self, parent, openai_client, autotext_data: dict, save_fn):
        """Open the congress window, or re-focus an existing instance.

        parent        -- owning Tk widget; also carries the singleton
                         reference ``parent._kongress_window``.
        openai_client -- OpenAI client for the web-search step (may be None).
        autotext_data -- persisted settings dict (specialties, months, regions).
        save_fn       -- callback that persists *autotext_data*.
        """
        self._parent = parent
        self._client = openai_client
        self._data = autotext_data
        self._save_fn = save_fn
        self._link_cnt = 0        # number of link tags created so far
        self._font_size = 9       # current base font size in points
        self._search_running = False
        self._search_start = 0.0
        self._inline_msg = ""
        self._inline_var = tk.StringVar()
        # Singleton behaviour: if a window is already open, bring it to
        # the front and abort construction of a second one.
        existing = getattr(parent, "_kongress_window", None)
        if existing is not None:
            try:
                if existing.winfo_exists():
                    existing.deiconify()
                    existing.lift()
                    existing.focus_force()
                    return
            except Exception:
                pass
        saved_specs = self._data.get("kongress_specialties")
        if isinstance(saved_specs, list) and saved_specs:
            self._selected: Set[str] = set(saved_specs)
        else:
            # Default specialty when nothing was saved yet.
            self._selected = {"Dermatologie und Venerologie"}
        self._months: int = self._data.get("kongress_months_ahead", 12)
        self._regions: List[str] = list(
            self._data.get("kongress_regions", ["CH", "EU", "Weltweit"])
        )
        win = tk.Toplevel(parent)
        self._win = win
        parent._kongress_window = win
        win.title("Kongresse & Weiterbildungen")
        win.configure(bg=self._BG)
        win.minsize(600, 520)
        try:
            # Size the window relative to the screen; ignore failures on
            # platforms where screen metrics are unavailable.
            sw = max(1200, int(parent.winfo_screenwidth()))
            sh = max(800, int(parent.winfo_screenheight()))
            w, h = max(620, int(sw * 0.36)), max(640, int(sh * 0.88))
            win.geometry(f"{w}x{h}+8+40")
        except Exception:
            pass
        self._build_header()
        self._build_text_area()
        self._build_status_bar()
        self._run_pipeline()
    # ── Header ──────────────────────────────────────────────────────────
    def _build_header(self):
        """Build the top bar: title, region checkboxes, months combobox,
        action buttons, DB freshness label and zoom controls."""
        hdr = tk.Frame(self._win, bg=self._HDR_BG, padx=8, pady=5)
        hdr.pack(fill="x")
        tk.Label(hdr, text="Kongresse & Weiterbildungen", bg=self._HDR_BG,
                 fg=self._HDR_FG, font=("Segoe UI", 10, "bold")).pack(side="left")
        right = tk.Frame(hdr, bg=self._HDR_BG)
        right.pack(side="right")
        rf = tk.Frame(right, bg=self._HDR_BG)
        rf.pack(side="left", padx=(0, 6))
        self._rv = {}
        for r in ["CH", "EU", "Weltweit"]:
            v = tk.BooleanVar(value=(r in self._regions))
            self._rv[r] = v
            ttk.Checkbutton(rf, text=r, variable=v, command=self._on_regions).pack(side="left", padx=1)
        self._mv = tk.StringVar(value=f"{self._months} Monate")
        cb = ttk.Combobox(right, textvariable=self._mv, width=10,
                          values=["6 Monate", "12 Monate", "24 Monate"], state="readonly")
        cb.pack(side="left", padx=(0, 4))
        cb.bind("<<ComboboxSelected>>", self._on_months)
        ttk.Button(right, text="Fachrichtungen…",
                   command=self._open_spec_dlg).pack(side="left", padx=2)
        ttk.Button(right, text="Suche starten",
                   command=self._run_pipeline).pack(side="left", padx=2)
        ttk.Button(right, text="DB aktualisieren",
                   command=self._force_refresh).pack(side="left", padx=2)
        self._db_label = tk.Label(right, text="", bg=self._HDR_BG, fg="#5a7a8c",
                                  font=("Segoe UI", 7))
        self._db_label.pack(side="left", padx=(6, 0))
        self._update_db_label()
        # Zoom controls: two clickable glyph labels (zoom in / zoom out).
        # NOTE(review): the label text literals render as empty here — they
        # likely contain non-ASCII glyphs stripped by display; the cget()
        # comparison below distinguishes the two. Confirm against the file.
        zf = tk.Frame(right, bg=self._HDR_BG)
        zf.pack(side="left", padx=(8, 0))
        tk.Label(zf, text="", bg=self._HDR_BG, fg="#3060a0",
                 font=("Segoe UI", 7, "bold"), cursor="hand2").pack(side="top", pady=0)
        tk.Label(zf, text="", bg=self._HDR_BG, fg="#3060a0",
                 font=("Segoe UI", 7, "bold"), cursor="hand2").pack(side="top", pady=0)
        for child in zf.winfo_children():
            if child.cget("text") == "":
                child.bind("<Button-1>", lambda e: self._change_font_size(1))
            else:
                child.bind("<Button-1>", lambda e: self._change_font_size(-1))
def _update_db_label(self):
t = _event_db.last_refresh_time()
n = _event_db.count()
self._db_label.configure(text=f"DB: {t} · {n} Events")
def _on_months(self, _e=None):
try:
self._months = int(self._mv.get().split()[0])
except Exception:
self._months = 12
self._data["kongress_months_ahead"] = self._months
self._persist()
def _on_regions(self):
self._regions = [r for r, v in self._rv.items() if v.get()]
if not self._regions:
self._regions = ["CH", "EU", "Weltweit"]
for v in self._rv.values():
v.set(True)
self._data["kongress_regions"] = self._regions
self._persist()
    # ── Specialty dialog ────────────────────────────────────────────────
    def _open_spec_dlg(self):
        """Modal dialog for picking FMH/SIWF specialty titles.

        Offers live filtering, select-all/none, a refresh of the online
        title list, and Apply / Apply-and-search actions.
        """
        dlg = tk.Toplevel(self._win)
        dlg.title("Fachrichtungen auswählen")
        dlg.configure(bg="#f7fafc")
        dlg.geometry("380x580")
        dlg.transient(self._win)
        dlg.grab_set()
        self._center_on_screen(dlg, 380, 580)
        top = tk.Frame(dlg, bg="#f7fafc")
        top.pack(fill="x", padx=8, pady=(8, 4))
        tk.Label(top, text="FMH/SIWF Facharzttitel", bg="#f7fafc", fg="#1e4060",
                 font=("Segoe UI", 9, "bold")).pack(side="left")
        sv = tk.StringVar()
        ttk.Entry(top, textvariable=sv, width=22).pack(side="right", padx=(6, 0))
        tk.Label(top, text="Filter:", bg="#f7fafc", fg="#555",
                 font=("Segoe UI", 8)).pack(side="right")
        # Scrollable checkbox list hosted inside a canvas.
        lf = tk.Frame(dlg, bg="#fff")
        lf.pack(fill="both", expand=True, padx=8, pady=4)
        cvs = tk.Canvas(lf, bg="#fff", highlightthickness=0)
        sb = ttk.Scrollbar(lf, orient="vertical", command=cvs.yview)
        inner = tk.Frame(cvs, bg="#fff")
        inner.bind("<Configure>", lambda e: cvs.configure(scrollregion=cvs.bbox("all")))
        cvs.create_window((0, 0), window=inner, anchor="nw")
        cvs.configure(yscrollcommand=sb.set)
        sb.pack(side="right", fill="y")
        cvs.pack(side="left", fill="both", expand=True)
        titles = _spec_svc.get_titles()
        cvar: dict[str, tk.BooleanVar] = {}           # title -> checked state
        cwid: list[tuple[str, ttk.Checkbutton]] = []  # title -> widget (filtering)
        for t in titles:
            v = tk.BooleanVar(value=(t in self._selected))
            cvar[t] = v
            c = ttk.Checkbutton(inner, text=t, variable=v)
            c.pack(anchor="w", padx=6)
            cwid.append((t, c))
        def _filt(*_):
            # Hide checkboxes whose title does not contain the filter text.
            q = sv.get().lower().strip()
            for t, c in cwid:
                if q and q not in t.lower():
                    c.pack_forget()
                else:
                    c.pack(anchor="w", padx=6)
        sv.trace_add("write", _filt)
        def _mw(e):
            # Windows-style mouse-wheel scrolling (delta in multiples of 120).
            cvs.yview_scroll(int(-1 * (e.delta / 120)), "units")
        cvs.bind_all("<MouseWheel>", _mw)
        bf = tk.Frame(dlg, bg="#f7fafc")
        bf.pack(fill="x", padx=8, pady=6)
        ttk.Button(bf, text="Alle",
                   command=lambda: [v.set(True) for v in cvar.values()]).pack(side="left", padx=2)
        ttk.Button(bf, text="Keine",
                   command=lambda: [v.set(False) for v in cvar.values()]).pack(side="left", padx=2)
        rl = tk.Label(bf, text="", bg="#f7fafc", fg="#555", font=("Segoe UI", 8))
        rl.pack(side="left", padx=8)
        def _rd(ok, n):
            # Refresh callback may run off the UI thread; marshal via after().
            self._win.after(0, lambda: rl.configure(
                text=f"{n} Titel geladen" if ok else "Fehlgeschlagen"))
        ttk.Button(bf, text="Liste aktualisieren",
                   command=lambda: (rl.configure(text="Lade…"),
                                    _spec_svc.refresh(callback=_rd))).pack(side="left", padx=2)
        def _apply():
            # Unbind the global wheel handler before closing the dialog.
            cvs.unbind_all("<MouseWheel>")
            ch = {k for k, v in cvar.items() if v.get()}
            if not ch:
                self._show_select_specialty_hint(dlg)
                return
            self._selected = ch
            self._data["kongress_specialties"] = sorted(self._selected)
            self._persist()
            dlg.destroy()
        def _apply_search():
            # NOTE(review): if _apply aborts (empty selection) the pipeline
            # still runs with the previous selection — confirm intended.
            _apply()
            self._run_pipeline()
        def _close():
            cvs.unbind_all("<MouseWheel>")
            dlg.destroy()
        ttk.Button(bf, text="Übernehmen", command=_apply).pack(side="right", padx=2)
        ttk.Button(bf, text="Neue Suche starten", command=_apply_search).pack(side="right", padx=2)
        dlg.protocol("WM_DELETE_WINDOW", _close)
def _center_on_screen(self, win: tk.Toplevel, width: int, height: int):
try:
sw = max(800, int(win.winfo_screenwidth()))
sh = max(600, int(win.winfo_screenheight()))
x = max(0, (sw - width) // 2)
y = max(0, (sh - height) // 2)
win.geometry(f"{width}x{height}+{x}+{y}")
except Exception:
pass
    def _show_select_specialty_hint(self, parent: tk.Toplevel):
        """Small modal telling the user to pick at least one specialty."""
        hint = tk.Toplevel(parent)
        hint.title("Fachrichtung wählen")
        hint.transient(parent)
        hint.grab_set()
        hint.configure(bg="#f7fafc")
        self._center_on_screen(hint, 420, 150)
        body = tk.Frame(hint, bg="#f7fafc", padx=16, pady=14)
        body.pack(fill="both", expand=True)
        tk.Label(
            body,
            text="Bitte mindestens eine Fachrichtung auswählen,\n"
                 "damit die Kongresssuche gezielt bleibt und keine unnötigen Tokens verbraucht.",
            bg="#f7fafc",
            fg="#1e4060",
            justify="left",
            anchor="w",
            font=("Segoe UI", 9),
        ).pack(fill="x", pady=(0, 12))
        ttk.Button(body, text="OK", command=hint.destroy).pack(anchor="e")
    # ── Text area ───────────────────────────────────────────────────────
    def _build_text_area(self):
        """Create the two stacked text panes: major congresses on top
        (fixed height) and the main event list below (expanding)."""
        f = tk.Frame(self._win, bg=self._CARD_BG, bd=0)
        f.pack(fill="both", expand=True, padx=6, pady=(2, 4))
        self._major_wrap = tk.Frame(f, bg=self._CARD_BG, bd=0)
        self._major_wrap.pack(fill="x", side="top", pady=(0, 4))
        # Fixed-height panel: size is driven by _update_major_panel_height.
        self._major_wrap.pack_propagate(False)
        self._major_text = tk.Text(
            self._major_wrap, wrap="word", font=("Segoe UI", 9), bg=self._CARD_BG,
            fg=self._TEXT_FG, relief="flat", padx=10, pady=8, cursor="arrow", spacing1=1, spacing3=1
        )
        sb_major = ttk.Scrollbar(self._major_wrap, orient="vertical", command=self._major_text.yview)
        self._major_text.configure(yscrollcommand=sb_major.set)
        sb_major.pack(side="right", fill="y")
        self._major_text.pack(side="left", fill="both", expand=True)
        self._major_text.configure(state="disabled")
        self._main_wrap = tk.Frame(f, bg=self._CARD_BG, bd=0)
        self._main_wrap.pack(fill="both", expand=True, side="top")
        self._text = tk.Text(
            self._main_wrap, wrap="word", font=("Segoe UI", 9), bg=self._CARD_BG,
            fg=self._TEXT_FG, relief="flat", padx=10, pady=8, cursor="arrow", spacing1=1, spacing3=1
        )
        sb = ttk.Scrollbar(self._main_wrap, orient="vertical", command=self._text.yview)
        self._text.configure(yscrollcommand=sb.set)
        sb.pack(side="right", fill="y")
        self._text.pack(side="left", fill="both", expand=True)
        self._text.configure(state="disabled")
        self._configure_text_tags(self._major_text)
        self._configure_text_tags(self._text)
        self._update_major_panel_height()
        # Re-cap the top panel height whenever the window is resized.
        self._win.bind("<Configure>", self._on_window_resize_for_major_panel, add="+")
def _configure_text_tags(self, widget: tk.Text):
widget.tag_configure("month_hdr", font=("Segoe UI", 10, "bold"),
foreground="#0e3350", spacing1=10, spacing3=4,
background="#e0ecf5")
widget.tag_configure("section", font=("Segoe UI", 9, "bold"),
foreground="#2060a0", spacing1=6, spacing3=2)
widget.tag_configure("title", font=("Segoe UI", 9, "bold"),
foreground="#0e3350")
widget.tag_configure("normal", font=("Segoe UI", 9),
foreground="#2b4a5c")
widget.tag_configure("meta", font=("Segoe UI", 8),
foreground="#5a7a8c")
widget.tag_configure("type_tag", font=("Segoe UI", 7, "bold"),
foreground="#ffffff", background="#5090c0")
widget.tag_configure("loading", font=("Segoe UI", 9),
foreground="#6a9ab0")
widget.tag_configure("warn", font=("Segoe UI", 9),
foreground="#b07020")
widget.tag_configure("sep", font=("Segoe UI", 2),
foreground=self._SEP_FG)
    def _on_window_resize_for_major_panel(self, _event=None):
        """<Configure> handler: keep the major-congress panel height capped."""
        self._update_major_panel_height()
def _update_major_panel_height(self):
try:
h = int(self._win.winfo_height())
max_h = max(180, min(520, h // 2))
self._major_wrap.configure(height=max_h)
except Exception:
pass
    def _build_status_bar(self):
        """Create the single-line status bar at the bottom of the window."""
        self._status = tk.StringVar(value="")
        tk.Label(self._win, textvariable=self._status, bg=self._HDR_BG,
                 fg="#4a7a8c", font=("Segoe UI", 8), anchor="w",
                 padx=8).pack(fill="x", side="bottom")
def _change_font_size(self, delta: int):
new = max(6, min(18, self._font_size + delta))
if new == self._font_size:
return
self._font_size = new
self._apply_font_tags()
def _apply_font_tags(self):
s = self._font_size
for widget in (self._major_text, self._text):
widget.configure(font=("Segoe UI", s))
widget.tag_configure("month_hdr", font=("Segoe UI", s + 1, "bold"))
widget.tag_configure("section", font=("Segoe UI", s, "bold"))
widget.tag_configure("title", font=("Segoe UI", s, "bold"))
widget.tag_configure("normal", font=("Segoe UI", s))
widget.tag_configure("meta", font=("Segoe UI", s - 1))
widget.tag_configure("type_tag", font=("Segoe UI", s - 2, "bold"))
widget.tag_configure("loading", font=("Segoe UI", s))
widget.tag_configure("warn", font=("Segoe UI", s))
for i in range(1, self._link_cnt + 1):
widget.tag_configure(f"cl_{i}", font=("Segoe UI", s - 1, "underline"))
    # ── Render helpers ──────────────────────────────────────────────────
    def _open_url(self, url):
        """Open *url* in the default browser; failures are ignored."""
        try:
            webbrowser.open(url)
        except Exception:
            pass
def _insert_link(self, text_widget: tk.Text, url: str, label: Optional[str] = None):
self._link_cnt += 1
tag = f"cl_{self._link_cnt}"
text_widget.tag_configure(tag, font=("Segoe UI", 8, "underline"),
foreground=self._LINK_FG)
text_widget.tag_bind(tag, "<Button-1>", lambda e, u=url: self._open_url(u))
text_widget.tag_bind(tag, "<Enter>", lambda e, w=text_widget: w.configure(cursor="hand2"))
text_widget.tag_bind(tag, "<Leave>", lambda e, w=text_widget: w.configure(cursor="arrow"))
text_widget.tag_bind(tag, "<Button-3>", lambda e, u=url, w=text_widget: self._show_link_menu(w, e, u))
text_widget.insert("end", label or url, tag)
    def _show_link_menu(self, text_widget: tk.Text, event, url: str):
        """Right-click context menu for a link: copy URL or open in browser."""
        m = tk.Menu(text_widget, tearoff=0, font=("Segoe UI", 9))
        m.add_command(label="Link kopieren", command=lambda: self._copy(url))
        m.add_command(label="Im Browser öffnen", command=lambda: self._open_url(url))
        try:
            m.tk_popup(event.x_root, event.y_root)
        finally:
            # Always release the grab, even if the popup failed.
            m.grab_release()
    def _insert_btn(self, text_widget: tk.Text, label, bg, fg, cmd):
        """Embed a small clickable label-button inside the text widget."""
        btn = tk.Label(text_widget, text=f" {label} ", bg=bg, fg=fg,
                       font=("Segoe UI", 7, "bold"), cursor="hand2", relief="flat", padx=3)
        btn.bind("<Button-1>", lambda e: cmd())
        text_widget.window_create("end", window=btn, padx=2)
    def _copy(self, text):
        """Copy *text* to the clipboard (best effort, failures ignored)."""
        try:
            self._win.clipboard_clear()
            self._win.clipboard_append(text)
        except Exception:
            pass
    def _format_date_range(self, ev: Dict) -> str:
        """Format the event's date(s) as dd.mm.yyyy, preferring parsed dates.

        Falls back to the raw date strings when parsing failed; returns ""
        when no date information is present.
        """
        sd = ev.get("_parsed_start")
        ed = ev.get("_parsed_end")
        if sd and ed and sd != ed:
            # Range with distinct start/end days.
            # NOTE(review): the separator literal may contain a dash glyph
            # not rendered in this view — confirm against the file.
            return f"{sd.strftime('%d.%m.%Y')} {ed.strftime('%d.%m.%Y')}"
        if sd:
            return sd.strftime("%d.%m.%Y")
        # Parsing failed: show the raw strings as stored.
        raw_s = ev.get("start_date", "")
        raw_e = ev.get("end_date", "")
        if raw_s:
            return f"{raw_s} {raw_e}" if raw_e and raw_e != raw_s else raw_s
        return ""
    def _render_card(self, text_widget: tk.Text, ev: Dict):
        """Render one event as a card: type badge, title, meta line,
        optional CME evidence / organiser / description, and either the
        event link or a prefilled Google-search fallback button."""
        url = ev.get("url", "")
        title = ev.get("title", "Unbekannt")
        typ = ev.get("type", "unknown")
        fmt = ev.get("format", "unknown")
        cme = ev.get("cme", "")
        loc = ev.get("location", "")
        desc = ev.get("description", "")
        org = ev.get("_org", "")
        # Blank out fields that look like scraped page junk (helper
        # defined elsewhere in this module).
        if _is_css_junk(loc):
            loc = ""
        if _is_css_junk(desc):
            desc = ""
        if _is_css_junk(org):
            org = ""
        if _is_css_junk(title):
            title = "Unbekannt"
        type_label = self._TYPE_LABELS.get(typ, "")
        fmt_label = self._FORMAT_LABELS.get(fmt, "")
        date_str = self._format_date_range(ev)
        text_widget.insert("end", " ")
        if type_label:
            text_widget.insert("end", f" {type_label} ", "type_tag")
            text_widget.insert("end", " ")
        text_widget.insert("end", title, "title")
        text_widget.insert("end", "\n")
        meta = []
        if date_str:
            # Annotate the date with the confidence set during validation.
            conf = str(ev.get("_date_confidence", "") or "").lower().strip()
            if conf == "high":
                date_note = "Datum: verifiziert"
            elif conf == "medium":
                date_note = "Datum: bitte prüfen"
            else:
                date_note = "Datum: unsicher"
            meta.append(f"{date_str} · {date_note}")
        if loc:
            meta.append(loc)
        if fmt_label:
            meta.append(fmt_label)
        if cme:
            meta.append(f"CME: {cme}")
        if meta:
            text_widget.insert("end", f" {' · '.join(meta)}\n", "meta")
        # Show the verbatim CME evidence only when a CME value is shown too.
        cme_ev = ev.get("cme_evidence", "")
        if cme_ev and cme:
            text_widget.insert("end", f" CME Hinweis: {cme_ev}\n", "meta")
        if org:
            text_widget.insert("end", f" {org}\n", "meta")
        if desc:
            text_widget.insert("end", f" {desc}\n", "normal")
        if url:
            text_widget.insert("end", " ")
            self._insert_link(text_widget, url)
            text_widget.insert("end", "\n\n")
        else:
            # No URL: offer a prefilled Google search built from title,
            # location and date instead.
            search_q = title
            if loc:
                search_q += f" {loc}"
            if date_str:
                search_q += f" {date_str}"
            from urllib.parse import quote_plus as _qp
            google_url = "https://www.google.com/search?q=" + _qp(search_q)
            text_widget.insert("end", " ")
            self._insert_btn(text_widget, "Im Google suchen", "#f0e8d8", "#6a5020",
                             lambda u=google_url: self._open_url(u))
            text_widget.insert("end", "\n\n")
    def _render_events(self, text_widget: tk.Text, events: List[Dict]):
        """Render *events* grouped under month headers.

        Assumes the list is sorted by start date (see merge_all); events
        without a parsed start date are collected under a final
        "Weiteres" header.
        """
        current_month = ""
        for ev in events:
            sd = ev.get("_parsed_start")
            if sd:
                ml = _month_label(sd)
                if ml != current_month:
                    current_month = ml
                    text_widget.insert("end", f" {ml}\n", "month_hdr")
                    text_widget.insert("end", "\n")
            elif current_month != "__unknown__":
                # First undated event: emit the catch-all header once.
                current_month = "__unknown__"
                text_widget.insert("end", " Weiteres\n", "month_hdr")
                text_widget.insert("end", "\n")
            self._render_card(text_widget, ev)
    # ── Pipeline ────────────────────────────────────────────────────────
    def _persist(self):
        """Best-effort save of the settings dict via the injected save_fn."""
        try:
            self._save_fn(self._data)
        except Exception:
            # Persisting settings must never break the UI.
            pass
    def _force_refresh(self):
        """Re-run the whole pipeline with a forced database refresh."""
        self._run_pipeline(force_refresh=True)
def _query_events_with_fallback(
self, specialties: Set[str], months: int, regions: List[str]
) -> tuple[List[Dict[str, Any]], str]:
primary = _event_db.query(specialties, months, regions)
if len(primary) >= 20:
return primary, ""
relaxed_specs = _event_db.query(set(), months, regions)
if len(relaxed_specs) > len(primary):
return relaxed_specs, "Erweiterte Anzeige: alle Fachrichtungen."
relaxed_months = max(24, int(months))
relaxed_all = _event_db.query(set(), relaxed_months, ["CH", "EU", "Weltweit"])
if len(relaxed_all) > len(primary):
return relaxed_all, "Erweiterte Anzeige: alle Fachrichtungen, Regionen und 24 Monate."
return primary, ""
    def _set_inline_status(self, text: str):
        """Set the inline progress message, appending the elapsed seconds."""
        self._inline_msg = text
        # Fall back to "now" (elapsed 0) if no search start time is set yet.
        elapsed = int(time.time() - getattr(self, "_search_start", time.time()))
        self._inline_var.set(f" {text} ({elapsed} Sek.)")
    def _tick_inline_timer(self):
        """Once per second while a search runs: refresh the elapsed-time
        display in both the inline label and the status bar."""
        if not getattr(self, "_search_running", False):
            return  # search finished — stop rescheduling
        elapsed = int(time.time() - self._search_start)
        msg = getattr(self, "_inline_msg", "")
        full = f" {msg} ({elapsed} Sek.)" if msg else f" ({elapsed} Sek.)"
        self._inline_var.set(full)
        self._status.set(f"{msg} ({elapsed} Sek.)" if msg else f"({elapsed} Sek.)")
        # Reschedule on the Tk event loop.
        self._win.after(1000, self._tick_inline_timer)
    def _run_pipeline(self, force_refresh: bool = False):
        """Render cached content immediately, then — if the DB is stale or
        *force_refresh* is set — refresh it in a background thread and
        re-render via _show_final."""
        self._major_text.configure(state="normal")
        self._major_text.delete("1.0", "end")
        self._text.configure(state="normal")
        self._text.delete("1.0", "end")
        self._link_cnt = 0
        # 1) Curated major congresses go into the top panel, grouped by
        #    their first specialty.
        registry_events = MajorCongressRegistry.as_db_events(self._selected)
        if registry_events:
            self._major_text.insert("end", " Wichtige Hauptkongresse\n", "month_hdr")
            self._major_text.insert("end", "\n")
            cur_spec = ""
            for ev in registry_events:
                specs = ev.get("specialties", [])
                sp = specs[0] if specs else ""
                if sp != cur_spec:
                    cur_spec = sp
                    self._major_text.insert("end", f" {sp}\n", "section")
                self._render_card(self._major_text, ev)
            # NOTE(review): separator literal may contain a glyph not
            # rendered in this view.
            self._major_text.insert("end", "" * 50 + "\n", "sep")
            self._major_text.insert("end", "\n")
            self._major_wrap.pack(fill="x", side="top", pady=(0, 4))
        else:
            self._major_wrap.pack_forget()
        # 2) Cached events from the local DB (filters may have been relaxed).
        cached, fallback_note = self._query_events_with_fallback(
            self._selected, self._months, self._regions
        )
        _validate_cme_batch(cached)
        if cached:
            self._text.insert("end", f" Gespeicherte Events ({len(cached)})\n", "month_hdr")
            self._text.insert("end", "\n")
            self._render_events(self._text, cached)
            self._text.insert("end", "" * 50 + "\n", "sep")
            self._text.insert("end", "\n")
        if fallback_note:
            self._text.insert("end", f" {fallback_note}\n", "meta")
            self._text.insert("end", "\n")
        do_refresh = force_refresh or _event_db.needs_refresh()
        if do_refresh:
            # Show a live-updating inline label while the refresh runs.
            self._text.insert("end", " Aktualisiere Datenbank …\n", "loading")
            self._inline_var = tk.StringVar(value=" ")
            lbl = tk.Label(self._text, textvariable=self._inline_var,
                           bg=self._CARD_BG, fg="#6a9ab0", font=("Segoe UI", 9),
                           anchor="w")
            self._text.window_create("end", window=lbl)
            self._text.insert("end", "\n")
        else:
            self._text.insert("end", f" Datenbank aktuell (Stand: {_event_db.last_refresh_time()})\n", "meta")
        self._major_text.configure(state="disabled")
        self._text.configure(state="disabled")
        self._update_db_label()
        specs = sorted(self._selected) if self._selected else ["Dermatologie und Venerologie"]
        self._status.set(f"{'Aktualisiere' if do_refresh else 'Geladen'} · {', '.join(specs)}")
        if not do_refresh:
            return
        self._search_start = time.time()
        self._search_running = True
        self._tick_inline_timer()
        def _bg_job():
            # Runs in a daemon thread: crawl seed sources, run the GPT web
            # search, validate, merge into the DB, then re-render on the
            # UI thread (all UI access is marshalled via win.after).
            all_new: List[Dict[str, Any]] = []
            def _update_inline(txt):
                self._win.after(0, lambda: self._set_inline_status(txt))
            def _progress(i, t, name):
                msg = f"Seed-Crawling {i}/{t}: {name}"
                self._win.after(0, lambda: self._status.set(msg))
                _update_inline(msg)
            try:
                seed_events = SeedCrawler.crawl_for_specialties(
                    self._selected, progress_cb=_progress)
                all_new.extend(seed_events)
                msg = f"Seed: {len(seed_events)} Events · starte Websuche …"
                self._win.after(0, lambda: self._status.set(msg))
                _update_inline(msg)
            except Exception:
                # Seed crawling is best effort; continue with the search.
                pass
            try:
                if self._client:
                    major_names = MajorCongressRegistry.names_for(self._selected)
                    system, user = CongressSearchEngine.build_prompt(
                        specs, self._months, self._regions, major_names)
                    # Model is overridable via env vars.
                    model = os.getenv(
                        "CONGRESS_SEARCH_MODEL",
                        os.getenv("NEWS_SEARCH_MODEL", "gpt-4o-mini-search-preview")
                    ).strip()
                    resp = self._client.chat.completions.create(
                        model=model,
                        messages=[
                            {"role": "system", "content": system},
                            {"role": "user", "content": user},
                        ],
                    )
                    raw = (resp.choices[0].message.content or "").strip()
                    search_events = CongressSearchEngine.parse_response(raw, specs)
                    all_new.extend(search_events)
            except Exception:
                # Web search is best effort as well.
                pass
            _validate_cme_batch(all_new)
            _update_inline("Link-Prüfung …")
            # Verify at most 180 non-registry URLs before persisting.
            checkable = [e for e in all_new if e.get("source") != "registry" and e.get("url")]
            check_count = 0
            for ev in checkable:
                if check_count >= 180:
                    break
                check_count += 1
                msg = f"Link-Prüfung {check_count}/{min(len(checkable), 180)}"
                self._win.after(0, lambda m=msg: self._status.set(m))
                _update_inline(msg)
                _validate_search_event_url(ev)
            _event_db.merge_in(all_new)
            _event_db.save()
            self._search_running = False
            final_cached, _ = self._query_events_with_fallback(
                self._selected, self._months, self._regions
            )
            _validate_cme_batch(final_cached)
            final = CongressSearchEngine.merge_all(registry_events, final_cached, [])
            # Verify links before display where possible: never remove
            # events, only switch broken links to the Google fallback.
            registry_first = [e for e in final if e.get("source") == "registry" and e.get("url")]
            others = [e for e in final if e.get("source") != "registry" and e.get("url")]
            checked = 0
            for ev in registry_first + others:
                if checked >= 220:
                    break
                checked += 1
                _validate_search_event_url(ev)
            self._win.after(0, lambda: self._show_final(final))
        threading.Thread(target=_bg_job, daemon=True).start()
    def _show_final(self, events: List[Dict]):
        """Re-render both panes after the background refresh finished."""
        self._major_text.configure(state="normal")
        self._major_text.delete("1.0", "end")
        self._text.configure(state="normal")
        self._text.delete("1.0", "end")
        self._link_cnt = 0
        self._update_db_label()
        # Top panel: curated major congresses, grouped by first specialty.
        registry_events = MajorCongressRegistry.as_db_events(self._selected)
        if registry_events:
            self._major_text.insert("end", " Wichtige Hauptkongresse\n", "month_hdr")
            self._major_text.insert("end", "\n")
            cur_spec = ""
            for ev in registry_events:
                specs = ev.get("specialties", [])
                sp = specs[0] if specs else ""
                if sp != cur_spec:
                    cur_spec = sp
                    self._major_text.insert("end", f" {sp}\n", "section")
                self._render_card(self._major_text, ev)
            self._major_text.insert("end", "" * 50 + "\n", "sep")
            self._major_text.insert("end", "\n")
            self._major_wrap.pack(fill="x", side="top", pady=(0, 4))
        else:
            self._major_wrap.pack_forget()
        # Bottom panel: everything that is not a registry event.
        non_registry = [e for e in events if e.get("source") != "registry"]
        if not non_registry and not registry_events:
            self._text.insert("end", " Keine Events gefunden.\n", "warn")
            self._major_text.configure(state="disabled")
            self._text.configure(state="disabled")
            self._status.set("Keine Events")
            return
        if non_registry:
            self._text.insert("end", f" Kongresse & Weiterbildungen ({len(non_registry)})\n", "month_hdr")
            self._text.insert("end", "\n")
            self._render_events(self._text, non_registry)
        self._major_text.configure(state="disabled")
        self._text.configure(state="disabled")
        total = len(registry_events) + len(non_registry)
        self._status.set(
            f"Fertig · {total} Events · DB: {_event_db.last_refresh_time()}")