from __future__ import annotations

import re
import threading
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from typing import Any
from urllib.parse import urljoin, urlparse

# Verified-link results are cached for a week: event pages change rarely and
# re-probing the same URL on every request would hammer third-party sites.
URL_VERIFY_TTL_SECONDS = 7 * 24 * 60 * 60

_verify_lock = threading.Lock()
# url -> (result, expiry unix timestamp); guarded by _verify_lock.
_verify_cache: dict[str, tuple["LinkVerification", float]] = {}


@dataclass(frozen=True)
class LinkVerification:
    """Outcome of verifying that a URL resolves and points at a specific event page."""

    httpStatus: int  # final HTTP status; 0 means the request failed entirely
    finalUrl: str  # URL after following redirects ("" only for blank input)
    redirectCount: int  # number of redirects followed to reach finalUrl
    isDirectEventPage: bool  # heuristic: page is the event itself, not a listing
    checkedAt: str  # ISO-8601 UTC timestamp of when the check ran


def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string."""
    return datetime.now(timezone.utc).isoformat()


def _url_slug_tokens(url: str) -> set[str]:
    """Return the lower-cased alphanumeric tokens of the URL's path."""
    path = urlparse(url).path.lower()
    parts = [p for p in re.split(r"[^a-z0-9]+", path) if p]
    return set(parts)


def _name_tokens(name: str) -> set[str]:
    """Return tokens of length >= 4 from an event name (shorter words are too noisy)."""
    return {p for p in re.split(r"[^a-z0-9]+", (name or "").lower()) if len(p) >= 4}


def _extract_title(html: str) -> str:
    """Extract the <title> text of an HTML document, whitespace-collapsed.

    Returns at most 300 characters, or "" when no title tag is found.
    """
    # BUG FIX: the pattern had lost its tag delimiters ("]*>(.*?)") and could
    # never capture a title; restore the canonical <title>...</title> match.
    m = re.search(r"<title[^>]*>(.*?)</title>", html or "", flags=re.IGNORECASE | re.DOTALL)
    if not m:
        return ""
    t = re.sub(r"\s+", " ", m.group(1)).strip()
    return t[:300]


def _looks_generic_url(final_url: str) -> bool:
    """Return True when the URL path is a generic landing/listing page, not an event."""
    path = (urlparse(final_url).path or "/").strip().lower().rstrip("/")
    # NOTE: "/" is unreachable after rstrip("/") but kept for set stability.
    return path in {"", "/", "/events", "/calendar", "/home", "/congress", "/meetings"}


def _is_direct_event_page(final_url: str, event_name: str, title_text: str = "") -> bool:
    """Heuristically decide whether *final_url* is the event's own page.

    Scoring: year segment in the path (+2), URL slug overlapping the event
    name (+2), a name token in the page title (+1), a year in the title (+1),
    generic landing path (-2).  Threshold is 2.
    """
    score = 0
    slug = _url_slug_tokens(final_url)
    name_tokens = _name_tokens(event_name)
    title_l = (title_text or "").lower()
    if re.search(r"/20\d{2}(?:[/-]|$)", final_url.lower()):
        score += 2
    if slug.intersection(name_tokens):
        score += 2
    # Sampling 5 tokens from an unordered set: which 5 is nondeterministic,
    # but any hit is worth the same +1, so the heuristic stays stable enough.
    if any(tok in title_l for tok in list(name_tokens)[:5]):
        score += 1
    if re.search(r"\b20\d{2}\b", title_l):
        score += 1
    if _looks_generic_url(final_url):
        score -= 2
    return score >= 2


def _extract_candidate_links(base_url: str, html: str, event_name: str) -> list[str]:
    """Collect up to 8 same-host links from *html* that look event-specific.

    A link qualifies when its path carries a year segment or contains a token
    of the event name; cross-host and non-http(s) links are skipped.
    """
    hrefs = re.findall(r'href=["\']([^"\']+)["\']', html or "", flags=re.IGNORECASE)
    base_host = (urlparse(base_url).netloc or "").lower()
    tokens = _name_tokens(event_name)
    out: list[str] = []
    for href in hrefs:
        u = urljoin(base_url, href)
        p = urlparse(u)
        if p.scheme not in {"http", "https"}:
            continue
        if (p.netloc or "").lower() != base_host:
            continue
        path_l = p.path.lower()
        if re.search(r"/20\d{2}(?:[/-]|$)", path_l) or any(tok in path_l for tok in tokens):
            if u not in out:
                out.append(u)
                if len(out) >= 8:
                    break
    return out


def _request_with_fallback(url: str, timeout_s: int = 5) -> requests.Response | None:
    """HEAD the URL, falling back to GET when HEAD errors or is rejected.

    Returns the final Response (redirects followed), or None when both the
    HEAD and the fallback GET raise.
    """
    # Deferred import: keeps this module importable in environments without
    # requests installed; failure surfaces on first network call instead.
    import requests

    headers = {"User-Agent": "AZA-LiveEventSearch/1.0"}
    try:
        r = requests.head(url, allow_redirects=True, timeout=timeout_s, headers=headers)
        # Many servers reject or mishandle HEAD (405 and other 4xx/5xx);
        # retry those with a real GET.  (405 >= 400, so one check suffices.)
        if r.status_code >= 400:
            r = requests.get(url, allow_redirects=True, timeout=timeout_s + 2, headers=headers)
        return r
    except Exception:
        try:
            return requests.get(url, allow_redirects=True, timeout=timeout_s + 2, headers=headers)
        except Exception:
            return None


def verify_event_url(url: str, event_name: str) -> LinkVerification:
    """Verify that *url* resolves and heuristically points at *event_name*'s page.

    Results — including failures — are cached for URL_VERIFY_TTL_SECONDS.
    When the landing page is reachable but not itself the event page, up to 8
    same-host candidate links found in its HTML are probed for a direct match;
    the first direct hit is returned (and cached) instead.
    """
    u = (url or "").strip()
    if not u:
        # Blank input: report "never checked anything" without caching.
        return LinkVerification(httpStatus=0, finalUrl="", redirectCount=0,
                                isDirectEventPage=False, checkedAt=_now_iso())
    now_ts = datetime.now(timezone.utc).timestamp()
    with _verify_lock:
        cached = _verify_cache.get(u)
        if cached and cached[1] > now_ts:
            return cached[0]
    resp = _request_with_fallback(u, timeout_s=5)
    if resp is None:
        # Total request failure is cached too, so a dead link is not re-probed
        # on every call for the TTL window.
        out = LinkVerification(httpStatus=0, finalUrl=u, redirectCount=0,
                               isDirectEventPage=False, checkedAt=_now_iso())
        with _verify_lock:
            _verify_cache[u] = (out, now_ts + URL_VERIFY_TTL_SECONDS)
        return out
    final_url = str(resp.url or u)
    status = int(resp.status_code or 0)
    redirects = len(resp.history or [])
    # Only the first 12 KB of HTML is scanned — enough for <title> and the
    # navigation links we care about.
    text = resp.text[:12000] if isinstance(resp.text, str) else ""
    title_text = _extract_title(text)
    is_direct = (200 <= status < 300) and _is_direct_event_page(
        final_url, event_name, title_text=title_text
    )
    if (200 <= status < 300) and not is_direct and text:
        # Landing page reachable but generic: probe candidate in-site links
        # for the actual event page.
        for cand in _extract_candidate_links(final_url, text, event_name=event_name):
            second = _request_with_fallback(cand, timeout_s=5)
            if second is None:
                continue
            second_status = int(second.status_code or 0)
            second_final = str(second.url or cand)
            second_redirects = len(second.history or [])
            second_title = _extract_title(
                (second.text or "")[:12000] if isinstance(second.text, str) else ""
            )
            second_direct = (200 <= second_status < 300) and _is_direct_event_page(
                second_final, event_name, title_text=second_title
            )
            if second_direct:
                out = LinkVerification(
                    httpStatus=second_status,
                    finalUrl=second_final,
                    redirectCount=second_redirects,
                    isDirectEventPage=True,
                    checkedAt=_now_iso(),
                )
                with _verify_lock:
                    _verify_cache[u] = (out, now_ts + URL_VERIFY_TTL_SECONDS)
                return out
    out = LinkVerification(
        httpStatus=status,
        finalUrl=final_url,
        redirectCount=redirects,
        isDirectEventPage=is_direct,
        checkedAt=_now_iso(),
    )
    with _verify_lock:
        _verify_cache[u] = (out, now_ts + URL_VERIFY_TTL_SECONDS)
    return out


def verification_to_dict(v: LinkVerification) -> dict[str, Any]:
    """Serialize a LinkVerification to a plain, JSON-friendly dict."""
    return asdict(v)