Files
aza/AzA march 2026 - Kopie (6)/services/link_verify.py
2026-04-16 13:32:32 +02:00

168 lines
5.7 KiB
Python

from __future__ import annotations
import re
import threading
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from typing import Any
from urllib.parse import urljoin, urlparse
import requests
# How long a verification result (success or failure) stays cached: one week.
URL_VERIFY_TTL_SECONDS = 7 * 24 * 60 * 60
# Guards _verify_cache; verify_event_url may be called from multiple threads.
_verify_lock = threading.Lock()
# Maps the originally requested URL -> (verification result, expiry unix timestamp).
_verify_cache: dict[str, tuple["LinkVerification", float]] = {}
@dataclass(frozen=True)
class LinkVerification:
    """Immutable outcome of verifying a single event URL.

    Field names are camelCase because they are serialized as-is via
    verification_to_dict (presumably for a JSON API — confirm with callers).
    """

    httpStatus: int  # final HTTP status code, or 0 when the request failed entirely
    finalUrl: str  # URL after following redirects ("" when no URL was given)
    redirectCount: int  # number of redirect hops taken to reach finalUrl
    isDirectEventPage: bool  # heuristic: page appears dedicated to the named event
    checkedAt: str  # ISO-8601 UTC timestamp of when the check ran
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def _url_slug_tokens(url: str) -> set[str]:
path = urlparse(url).path.lower()
parts = [p for p in re.split(r"[^a-z0-9]+", path) if p]
return set(parts)
def _name_tokens(name: str) -> set[str]:
return {p for p in re.split(r"[^a-z0-9]+", (name or "").lower()) if len(p) >= 4}
def _extract_title(html: str) -> str:
m = re.search(r"<title[^>]*>(.*?)</title>", html or "", flags=re.IGNORECASE | re.DOTALL)
if not m:
return ""
t = re.sub(r"\s+", " ", m.group(1)).strip()
return t[:300]
def _looks_generic_url(final_url: str) -> bool:
path = (urlparse(final_url).path or "/").strip().lower().rstrip("/")
return path in {"", "/", "/events", "/calendar", "/home", "/congress", "/meetings"}
def _is_direct_event_page(final_url: str, event_name: str, title_text: str = "") -> bool:
    """Heuristic: does this URL look like a dedicated page for the named event?

    Several weak signals are weighted and summed; a total score of 2 or more
    counts as a direct event page. Generic landing pages are penalized.
    """
    url_l = final_url.lower()
    title_l = (title_text or "").lower()
    slug_tokens = _url_slug_tokens(final_url)
    name_tokens = _name_tokens(event_name)

    signals = [
        # A year segment in the path strongly suggests a specific edition.
        (2, bool(re.search(r"/20\d{2}(?:[/-]|$)", url_l))),
        # Event-name words appearing in the URL slug.
        (2, bool(slug_tokens & name_tokens)),
        # Event-name words appearing in the page title (first five checked).
        (1, any(tok in title_l for tok in list(name_tokens)[:5])),
        # A four-digit year mentioned in the title.
        (1, bool(re.search(r"\b20\d{2}\b", title_l))),
        # Penalty for generic listing/landing paths.
        (-2, _looks_generic_url(final_url)),
    ]
    score = sum(weight for weight, hit in signals if hit)
    return score >= 2
def _extract_candidate_links(base_url: str, html: str, event_name: str) -> list[str]:
    """Collect up to 8 same-host links from HTML that look event-related.

    A link qualifies when its path contains a year segment (/20xx) or any
    token from the event name. First-seen order is preserved and duplicates
    are skipped. Only http/https links on the same host as base_url count.
    """
    raw_hrefs = re.findall(r'href=["\']([^"\']+)["\']', html or "", flags=re.IGNORECASE)
    host = (urlparse(base_url).netloc or "").lower()
    name_toks = _name_tokens(event_name)
    year_pattern = re.compile(r"/20\d{2}(?:[/-]|$)")

    candidates: list[str] = []
    for raw in raw_hrefs:
        absolute = urljoin(base_url, raw)
        parsed = urlparse(absolute)
        if parsed.scheme not in {"http", "https"}:
            continue
        if (parsed.netloc or "").lower() != host:
            continue
        path_l = parsed.path.lower()
        if not (year_pattern.search(path_l) or any(tok in path_l for tok in name_toks)):
            continue
        if absolute in candidates:
            continue
        candidates.append(absolute)
        if len(candidates) >= 8:
            break
    return candidates
def _request_with_fallback(url: str, timeout_s: int = 5) -> requests.Response | None:
    """HEAD the URL, falling back to GET when HEAD fails or is rejected.

    Many servers answer HEAD with 4xx (including 405 Method Not Allowed)
    even though GET works, so any >=400 status triggers a GET retry.
    (The original condition also tested `== 405` separately, which is
    redundant since 405 >= 400.) GET gets a slightly longer timeout because
    it transfers a response body. Returns None when every attempt raises —
    callers treat that as "unreachable".
    """
    headers = {"User-Agent": "AZA-LiveEventSearch/1.0"}
    try:
        resp = requests.head(url, allow_redirects=True, timeout=timeout_s, headers=headers)
        if resp.status_code >= 400:
            resp = requests.get(url, allow_redirects=True, timeout=timeout_s + 2, headers=headers)
        return resp
    except Exception:
        # HEAD itself blew up (timeout, DNS, connection reset) — one GET attempt.
        try:
            return requests.get(url, allow_redirects=True, timeout=timeout_s + 2, headers=headers)
        except Exception:
            return None
def _cache_result(url: str, result: LinkVerification, expires_at: float) -> LinkVerification:
    """Store *result* in the verification cache until *expires_at*; return it."""
    with _verify_lock:
        _verify_cache[url] = (result, expires_at)
    return result


def verify_event_url(url: str, event_name: str) -> LinkVerification:
    """Verify that *url* is reachable and plausibly the event's own page.

    Results — including failures — are cached per URL for
    URL_VERIFY_TTL_SECONDS so repeated checks do not hammer remote sites.
    When the fetched page is not itself judged a direct event page, up to 8
    same-host candidate links found in its HTML are probed and the first
    direct match wins.

    The original inlined the lock/store/return sequence three times; it is
    now consolidated in _cache_result with identical behavior.
    """
    u = (url or "").strip()
    if not u:
        # Nothing to check; the empty key is never cached.
        return LinkVerification(httpStatus=0, finalUrl="", redirectCount=0,
                                isDirectEventPage=False, checkedAt=_now_iso())

    now_ts = datetime.now(timezone.utc).timestamp()
    expires_at = now_ts + URL_VERIFY_TTL_SECONDS

    with _verify_lock:
        cached = _verify_cache.get(u)
        if cached and cached[1] > now_ts:
            return cached[0]

    resp = _request_with_fallback(u, timeout_s=5)
    if resp is None:
        # Network failure: cache the negative result too, to avoid re-probing.
        return _cache_result(u, LinkVerification(
            httpStatus=0, finalUrl=u, redirectCount=0,
            isDirectEventPage=False, checkedAt=_now_iso(),
        ), expires_at)

    final_url = str(resp.url or u)
    status = int(resp.status_code or 0)
    redirects = len(resp.history or [])
    # Only the first 12 KB of the body is inspected (title + candidate links).
    text = resp.text[:12000] if isinstance(resp.text, str) else ""
    title_text = _extract_title(text)
    ok = 200 <= status < 300
    is_direct = ok and _is_direct_event_page(final_url, event_name, title_text=title_text)

    if ok and not is_direct and text:
        # Landing page looks generic — scan it for a more specific event link.
        for cand in _extract_candidate_links(final_url, text, event_name=event_name):
            second = _request_with_fallback(cand, timeout_s=5)
            if second is None:
                continue
            second_status = int(second.status_code or 0)
            second_final = str(second.url or cand)
            second_text = second.text[:12000] if isinstance(second.text, str) else ""
            second_title = _extract_title(second_text)
            if (200 <= second_status < 300) and _is_direct_event_page(
                    second_final, event_name, title_text=second_title):
                return _cache_result(u, LinkVerification(
                    httpStatus=second_status,
                    finalUrl=second_final,
                    redirectCount=len(second.history or []),
                    isDirectEventPage=True,
                    checkedAt=_now_iso(),
                ), expires_at)

    return _cache_result(u, LinkVerification(
        httpStatus=status,
        finalUrl=final_url,
        redirectCount=redirects,
        isDirectEventPage=is_direct,
        checkedAt=_now_iso(),
    ), expires_at)
def verification_to_dict(v: LinkVerification) -> dict[str, Any]:
    """Serialize a LinkVerification into a plain dict (e.g. for a JSON response)."""
    serialized: dict[str, Any] = asdict(v)
    return serialized