Files
aza/AzA march 2026 - Kopie (6)/services/link_verify.py
2026-04-16 13:32:32 +02:00

168 lines
5.7 KiB
Python

from __future__ import annotations
import re
import threading
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from typing import Any
from urllib.parse import urljoin, urlparse
import requests
# How long a verification result (success or failure) stays cached: one week.
URL_VERIFY_TTL_SECONDS = 7 * 24 * 60 * 60
# Guards _verify_cache; verify_event_url may be called from multiple threads.
_verify_lock = threading.Lock()
# Maps the originally requested URL -> (verification result, expiry unix timestamp).
_verify_cache: dict[str, tuple["LinkVerification", float]] = {}
@dataclass(frozen=True)
class LinkVerification:
    """Immutable outcome of verifying a single event URL.

    Field names are camelCase because they are serialized as-is via
    verification_to_dict (presumably for a JSON API — confirm with callers).
    """

    httpStatus: int  # final HTTP status code, or 0 when the request failed entirely
    finalUrl: str  # URL after following redirects ("" when no URL was given)
    redirectCount: int  # number of redirect hops taken to reach finalUrl
    isDirectEventPage: bool  # heuristic: page appears dedicated to the named event
    checkedAt: str  # ISO-8601 UTC timestamp of when the check ran
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def _url_slug_tokens(url: str) -> set[str]:
path = urlparse(url).path.lower()
parts = [p for p in re.split(r"[^a-z0-9]+", path) if p]
return set(parts)
def _name_tokens(name: str) -> set[str]:
return {p for p in re.split(r"[^a-z0-9]+", (name or "").lower()) if len(p) >= 4}
def _extract_title(html: str) -> str:
m = re.search(r"<title[^>]*>(.*?)</title>", html or "", flags=re.IGNORECASE | re.DOTALL)
if not m:
return ""
t = re.sub(r"\s+", " ", m.group(1)).strip()
return t[:300]
def _looks_generic_url(final_url: str) -> bool:
path = (urlparse(final_url).path or "/").strip().lower().rstrip("/")
return path in {"", "/", "/events", "/calendar", "/home", "/congress", "/meetings"}
def _is_direct_event_page(final_url: str, event_name: str, title_text: str = "") -> bool:
    """Heuristic: does this URL look like a dedicated page for the named event?

    Several weak signals are weighted and summed; a total score of 2 or more
    counts as a direct event page. Generic landing pages are penalized.
    """
    url_l = final_url.lower()
    title_l = (title_text or "").lower()
    slug_tokens = _url_slug_tokens(final_url)
    name_tokens = _name_tokens(event_name)

    signals = [
        # A year segment in the path strongly suggests a specific edition.
        (2, bool(re.search(r"/20\d{2}(?:[/-]|$)", url_l))),
        # Event-name words appearing in the URL slug.
        (2, bool(slug_tokens & name_tokens)),
        # Event-name words appearing in the page title (first five checked).
        (1, any(tok in title_l for tok in list(name_tokens)[:5])),
        # A four-digit year mentioned in the title.
        (1, bool(re.search(r"\b20\d{2}\b", title_l))),
        # Penalty for generic listing/landing paths.
        (-2, _looks_generic_url(final_url)),
    ]
    score = sum(weight for weight, hit in signals if hit)
    return score >= 2
def _extract_candidate_links(base_url: str, html: str, event_name: str) -> list[str]:
    """Collect up to 8 same-host links from HTML that look event-related.

    A link qualifies when its path contains a year segment (/20xx) or any
    token from the event name. First-seen order is preserved and duplicates
    are skipped. Only http/https links on the same host as base_url count.
    """
    raw_hrefs = re.findall(r'href=["\']([^"\']+)["\']', html or "", flags=re.IGNORECASE)
    host = (urlparse(base_url).netloc or "").lower()
    name_toks = _name_tokens(event_name)
    year_pattern = re.compile(r"/20\d{2}(?:[/-]|$)")

    candidates: list[str] = []
    for raw in raw_hrefs:
        absolute = urljoin(base_url, raw)
        parsed = urlparse(absolute)
        if parsed.scheme not in {"http", "https"}:
            continue
        if (parsed.netloc or "").lower() != host:
            continue
        path_l = parsed.path.lower()
        if not (year_pattern.search(path_l) or any(tok in path_l for tok in name_toks)):
            continue
        if absolute in candidates:
            continue
        candidates.append(absolute)
        if len(candidates) >= 8:
            break
    return candidates
def _request_with_fallback(url: str, timeout_s: int = 5) -> requests.Response | None:
    """HEAD the URL, falling back to GET when HEAD fails or is rejected.

    Many servers answer HEAD with 4xx (including 405 Method Not Allowed)
    even though GET works, so any >=400 status triggers a GET retry.
    (The original condition also tested `== 405` separately, which is
    redundant since 405 >= 400.) GET gets a slightly longer timeout because
    it transfers a response body. Returns None when every attempt raises —
    callers treat that as "unreachable".
    """
    headers = {"User-Agent": "AZA-LiveEventSearch/1.0"}
    try:
        resp = requests.head(url, allow_redirects=True, timeout=timeout_s, headers=headers)
        if resp.status_code >= 400:
            resp = requests.get(url, allow_redirects=True, timeout=timeout_s + 2, headers=headers)
        return resp
    except Exception:
        # HEAD itself blew up (timeout, DNS, connection reset) — one GET attempt.
        try:
            return requests.get(url, allow_redirects=True, timeout=timeout_s + 2, headers=headers)
        except Exception:
            return None
def _cache_result(url: str, result: LinkVerification, expires_at: float) -> LinkVerification:
    """Store *result* in the verification cache until *expires_at*; return it."""
    with _verify_lock:
        _verify_cache[url] = (result, expires_at)
    return result


def verify_event_url(url: str, event_name: str) -> LinkVerification:
    """Verify that *url* is reachable and plausibly the event's own page.

    Results — including failures — are cached per URL for
    URL_VERIFY_TTL_SECONDS so repeated checks do not hammer remote sites.
    When the fetched page is not itself judged a direct event page, up to 8
    same-host candidate links found in its HTML are probed and the first
    direct match wins.

    The original inlined the lock/store/return sequence three times; it is
    now consolidated in _cache_result with identical behavior.
    """
    u = (url or "").strip()
    if not u:
        # Nothing to check; the empty key is never cached.
        return LinkVerification(httpStatus=0, finalUrl="", redirectCount=0,
                                isDirectEventPage=False, checkedAt=_now_iso())

    now_ts = datetime.now(timezone.utc).timestamp()
    expires_at = now_ts + URL_VERIFY_TTL_SECONDS

    with _verify_lock:
        cached = _verify_cache.get(u)
        if cached and cached[1] > now_ts:
            return cached[0]

    resp = _request_with_fallback(u, timeout_s=5)
    if resp is None:
        # Network failure: cache the negative result too, to avoid re-probing.
        return _cache_result(u, LinkVerification(
            httpStatus=0, finalUrl=u, redirectCount=0,
            isDirectEventPage=False, checkedAt=_now_iso(),
        ), expires_at)

    final_url = str(resp.url or u)
    status = int(resp.status_code or 0)
    redirects = len(resp.history or [])
    # Only the first 12 KB of the body is inspected (title + candidate links).
    text = resp.text[:12000] if isinstance(resp.text, str) else ""
    title_text = _extract_title(text)
    ok = 200 <= status < 300
    is_direct = ok and _is_direct_event_page(final_url, event_name, title_text=title_text)

    if ok and not is_direct and text:
        # Landing page looks generic — scan it for a more specific event link.
        for cand in _extract_candidate_links(final_url, text, event_name=event_name):
            second = _request_with_fallback(cand, timeout_s=5)
            if second is None:
                continue
            second_status = int(second.status_code or 0)
            second_final = str(second.url or cand)
            second_text = second.text[:12000] if isinstance(second.text, str) else ""
            second_title = _extract_title(second_text)
            if (200 <= second_status < 300) and _is_direct_event_page(
                    second_final, event_name, title_text=second_title):
                return _cache_result(u, LinkVerification(
                    httpStatus=second_status,
                    finalUrl=second_final,
                    redirectCount=len(second.history or []),
                    isDirectEventPage=True,
                    checkedAt=_now_iso(),
                ), expires_at)

    return _cache_result(u, LinkVerification(
        httpStatus=status,
        finalUrl=final_url,
        redirectCount=redirects,
        isDirectEventPage=is_direct,
        checkedAt=_now_iso(),
    ), expires_at)
def verification_to_dict(v: LinkVerification) -> dict[str, Any]:
    """Serialize a LinkVerification into a plain dict (e.g. for a JSON response)."""
    serialized: dict[str, Any] = asdict(v)
    return serialized