168 lines
5.7 KiB
Python
168 lines
5.7 KiB
Python
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import re
|
||
|
|
import threading
|
||
|
|
from dataclasses import asdict, dataclass
|
||
|
|
from datetime import datetime, timezone
|
||
|
|
from typing import Any
|
||
|
|
from urllib.parse import urljoin, urlparse
|
||
|
|
|
||
|
|
import requests
|
||
|
|
|
||
|
|
|
||
|
|
# Lifetime of a cached verification verdict: seven days, expressed in seconds.
URL_VERIFY_TTL_SECONDS = 60 * 60 * 24 * 7

# Guards every read and write of ``_verify_cache``.
_verify_lock = threading.Lock()

# Lookup key (derived from the verified URL) -> (verdict, expiry as a POSIX
# timestamp in seconds).
_verify_cache: dict[str, tuple["LinkVerification", float]] = {}
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True)
class LinkVerification:
    """Immutable outcome of checking a single event URL.

    The camelCase field names are kept as-is when the record is converted to
    a dictionary with ``dataclasses.asdict``.
    """

    # HTTP status code of the response; 0 when no response was obtained.
    httpStatus: int
    # URL the request ended on after redirects ("" for an empty input URL).
    finalUrl: str
    # Number of redirect hops recorded for the response.
    redirectCount: int
    # True when the heuristics judged the final URL to be the event's own page.
    isDirectEventPage: bool
    # ISO-8601 UTC timestamp taken when the verdict was produced.
    checkedAt: str
|
||
|
|
|
||
|
|
|
||
|
|
def _now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
||
|
|
|
||
|
|
|
||
|
|
def _url_slug_tokens(url: str) -> set[str]:
    """Return the distinct lowercase alphanumeric tokens found in the URL path.

    Only the path component is considered; every run of characters other than
    ``a-z`` / ``0-9`` acts as a separator.
    """
    lowered_path = urlparse(url).path.lower()
    tokens: set[str] = set()
    for piece in re.split(r"[^a-z0-9]+", lowered_path):
        if piece:
            tokens.add(piece)
    return tokens
|
||
|
|
|
||
|
|
|
||
|
|
def _name_tokens(name: str) -> set[str]:
    """Return the lowercase alphanumeric words of ``name`` with 4+ characters.

    A falsy ``name`` (``None`` or ``""``) yields an empty set.
    """
    lowered = (name or "").lower()
    words = re.split(r"[^a-z0-9]+", lowered)
    return {word for word in words if len(word) >= 4}
|
||
|
|
|
||
|
|
|
||
|
|
def _extract_title(html: str) -> str:
    """Return the text of the first ``<title>`` element, tidied up.

    Whitespace runs are collapsed to single spaces, the result is trimmed and
    capped at 300 characters. Returns ``""`` when there is no title element or
    ``html`` is falsy.
    """
    markup = html or ""
    found = re.search(r"<title[^>]*>(.*?)</title>", markup, flags=re.IGNORECASE | re.DOTALL)
    if found is None:
        return ""
    collapsed = re.sub(r"\s+", " ", found.group(1))
    return collapsed.strip()[:300]
|
||
|
|
|
||
|
|
|
||
|
|
def _looks_generic_url(final_url: str) -> bool:
    """Tell whether the URL path is a generic landing or listing page.

    The path is trimmed, lowercased and stripped of trailing slashes before it
    is compared against a fixed list of generic paths; an empty path counts as
    the site root.
    """
    generic_paths = {"", "/", "/events", "/calendar", "/home", "/congress", "/meetings"}
    raw_path = urlparse(final_url).path
    if not raw_path:
        raw_path = "/"
    normalized = raw_path.strip().lower().rstrip("/")
    return normalized in generic_paths
|
||
|
|
|
||
|
|
|
||
|
|
def _is_direct_event_page(final_url: str, event_name: str, title_text: str = "") -> bool:
    """Heuristically decide whether ``final_url`` is a page about ``event_name``.

    Independent signals are summed and the page counts as direct when the
    score reaches 2:

    * +2  a ``/20xx`` year segment in the URL (query string and fragment are
      ignored)
    * +2  a URL path token equal to one of the event-name tokens
    * +1  an event-name token occurring in the page title
    * +1  a ``20xx`` year occurring in the page title
    * -2  the path is a generic landing page per ``_looks_generic_url``

    Args:
        final_url: URL the request ended on.
        event_name: Human-readable event name; tokenised with ``_name_tokens``.
        title_text: Contents of the page's ``<title>``, if known.

    Returns:
        ``True`` when the combined score is at least 2.
    """
    score = 0
    name_tokens = _name_tokens(event_name)
    title_l = (title_text or "").lower()

    # Drop the query string and fragment first: the "$" alternative can only
    # match at the very end, so ".../2025?utm=x" would otherwise be missed.
    url_without_query = re.split(r"[?#]", final_url.lower(), maxsplit=1)[0]
    if re.search(r"/20\d{2}(?:[/-]|$)", url_without_query):
        score += 2

    if _url_slug_tokens(final_url) & name_tokens:
        score += 2

    # A set iterates in hash order, which for strings varies between
    # interpreter runs. Rank the tokens (longest first, then alphabetically)
    # so the same five are probed every time.
    title_probe = sorted(name_tokens, key=lambda tok: (-len(tok), tok))[:5]
    if any(tok in title_l for tok in title_probe):
        score += 1

    if re.search(r"\b20\d{2}\b", title_l):
        score += 1

    if _looks_generic_url(final_url):
        score -= 2

    return score >= 2
|
||
|
|
|
||
|
|
|
||
|
|
def _extract_candidate_links(base_url: str, html: str, event_name: str) -> list[str]:
    """Collect same-host links from ``html`` that look like event pages.

    A link qualifies when its path contains a ``/20xx`` year segment or any
    event-name token (as a substring). Links are resolved against
    ``base_url``, returned in document order without duplicates, and capped
    at eight entries.

    Args:
        base_url: URL of the page ``html`` was fetched from.
        html: Page markup; falsy values are treated as empty.
        event_name: Event name, tokenised with ``_name_tokens``.

    Returns:
        Absolute http(s) URLs on the same host as ``base_url``, fragment
        removed, never including ``base_url`` itself.
    """
    # Function-scope import: the ``html`` parameter shadows the module name.
    from html import unescape

    base_host = (urlparse(base_url).netloc or "").lower()
    base_page = base_url.partition("#")[0]
    tokens = _name_tokens(event_name)
    year_segment = re.compile(r"/20\d{2}(?:[/-]|$)")

    found: list[str] = []
    seen: set[str] = set()
    for href in re.findall(r'href=["\']([^"\']+)["\']', html or "", flags=re.IGNORECASE):
        try:
            # Attribute values are HTML-escaped ("&amp;" for "&"). The fragment
            # never changes which document is fetched, so it is dropped to
            # avoid requesting the same page repeatedly.
            target = urljoin(base_url, unescape(href).strip()).partition("#")[0]
            parts = urlparse(target)
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); skip it.
            continue
        if target in seen or target == base_page:
            continue
        seen.add(target)
        if parts.scheme not in {"http", "https"}:
            continue
        if (parts.netloc or "").lower() != base_host:
            continue
        path_l = parts.path.lower()
        if year_segment.search(path_l) or any(tok in path_l for tok in tokens):
            found.append(target)
            if len(found) >= 8:
                break
    return found
|
||
|
|
|
||
|
|
|
||
|
|
# Network errors, rate limiting and 5xx responses are usually temporary, so
# those verdicts are remembered for minutes rather than the full
# URL_VERIFY_TTL_SECONDS.
_TRANSIENT_VERDICT_TTL_SECONDS = 10 * 60

# Only this many leading characters of a response body are inspected.
_BODY_PREFIX_CHARS = 12000


def _request_with_fallback(
    url: str, timeout_s: int = 5, *, need_body: bool = False
) -> requests.Response | None:
    """Fetch ``url`` with a cheap HEAD first, falling back to a single GET.

    Args:
        url: Address to request; redirects are followed.
        timeout_s: Timeout for the HEAD request; the GET gets two seconds more.
        need_body: Skip the HEAD probe and go straight to GET, for callers
            that need the response body (a HEAD response never has one).

    Returns:
        The HEAD response when it succeeded with a status below 400,
        otherwise the GET response, or ``None`` when the GET raised.
    """
    headers = {"User-Agent": "AZA-LiveEventSearch/1.0"}
    if not need_body:
        try:
            head = requests.head(url, allow_redirects=True, timeout=timeout_s, headers=headers)
        except Exception:
            # Deliberately broad: URLs are untrusted and this is best-effort.
            head = None
        if head is not None and head.status_code < 400:
            return head
    # Exactly one GET attempt, whether HEAD raised or answered with an error.
    try:
        return requests.get(url, allow_redirects=True, timeout=timeout_s + 2, headers=headers)
    except Exception:
        return None


def _judge_response(
    resp: requests.Response, requested_url: str, event_name: str
) -> tuple[LinkVerification, str]:
    """Turn a response into a verdict plus the inspected body prefix."""
    status = int(resp.status_code or 0)
    final_url = str(resp.url or requested_url)
    body = resp.text  # read once; the property decodes the payload on access
    text = body[:_BODY_PREFIX_CHARS] if isinstance(body, str) else ""
    is_direct = (200 <= status < 300) and _is_direct_event_page(
        final_url, event_name, title_text=_extract_title(text)
    )
    verdict = LinkVerification(
        httpStatus=status,
        finalUrl=final_url,
        redirectCount=len(resp.history or []),
        isDirectEventPage=is_direct,
        checkedAt=_now_iso(),
    )
    return verdict, text


def _probe_event_url(url: str, event_name: str) -> tuple[LinkVerification, str] | None:
    """Request ``url`` and judge it; ``None`` when no response was obtained.

    When a HEAD request succeeded but the URL alone did not identify the page
    as the event page, the page is fetched again with GET so that the title
    (and, for the caller, the links in the body) can be examined.
    """
    resp = _request_with_fallback(url, timeout_s=5)
    if resp is None:
        return None
    verdict, text = _judge_response(resp, url, event_name)

    answered_head = getattr(getattr(resp, "request", None), "method", None) == "HEAD"
    needs_body = (
        answered_head
        and not verdict.isDirectEventPage
        and 200 <= verdict.httpStatus < 300
    )
    if not needs_body:
        return verdict, text

    with_body = _request_with_fallback(url, timeout_s=5, need_body=True)
    if with_body is None:
        return verdict, text
    return _judge_response(with_body, url, event_name)


def _remember_verification(key: str, result: LinkVerification, now_ts: float) -> LinkVerification:
    """Store ``result`` in the cache under ``key`` and return it."""
    transient = (
        result.httpStatus == 0 or result.httpStatus == 429 or result.httpStatus >= 500
    )
    ttl = URL_VERIFY_TTL_SECONDS
    if transient:
        ttl = min(_TRANSIENT_VERDICT_TTL_SECONDS, URL_VERIFY_TTL_SECONDS)
    with _verify_lock:
        _verify_cache[key] = (result, now_ts + ttl)
    return result


def verify_event_url(url: str, event_name: str) -> LinkVerification:
    """Check that ``url`` is reachable and whether it is the event's own page.

    The URL is requested (following redirects) and scored with
    ``_is_direct_event_page``. If the page responds with 2xx but does not look
    like the event page, up to eight same-host candidate links found in its
    body are probed and the first one that does look direct is reported
    instead.

    Verdicts are cached per (URL, event name), because both the direct-page
    decision and the resolved ``finalUrl`` depend on the event name.

    Args:
        url: Link to verify; surrounding whitespace is ignored.
        event_name: Name of the event the link is supposed to belong to.

    Returns:
        A ``LinkVerification``. ``httpStatus`` is 0 when ``url`` is empty or
        no response could be obtained.
    """
    u = (url or "").strip()
    if not u:
        return LinkVerification(
            httpStatus=0, finalUrl="", redirectCount=0, isDirectEventPage=False, checkedAt=_now_iso()
        )

    now_ts = datetime.now(timezone.utc).timestamp()
    cache_key = f"{u}\n{(event_name or '').strip().lower()}"
    with _verify_lock:
        cached = _verify_cache.get(cache_key)
        if cached is not None:
            if cached[1] > now_ts:
                return cached[0]
            # Expired: drop it so stale entries do not pile up.
            del _verify_cache[cache_key]

    # Network I/O happens outside the lock so other callers are not blocked.
    probed = _probe_event_url(u, event_name)
    if probed is None:
        unreachable = LinkVerification(
            httpStatus=0, finalUrl=u, redirectCount=0, isDirectEventPage=False, checkedAt=_now_iso()
        )
        return _remember_verification(cache_key, unreachable, now_ts)

    primary, text = probed
    if (200 <= primary.httpStatus < 300) and not primary.isDirectEventPage and text:
        for cand in _extract_candidate_links(primary.finalUrl, text, event_name=event_name):
            hit = _probe_event_url(cand, event_name)
            if hit is not None and hit[0].isDirectEventPage:
                return _remember_verification(cache_key, hit[0], now_ts)

    return _remember_verification(cache_key, primary, now_ts)
|
||
|
|
|
||
|
|
|
||
|
|
def verification_to_dict(v: LinkVerification) -> dict[str, Any]:
    """Convert a verification record into a plain ``dict`` keyed by field name."""
    as_mapping: dict[str, Any] = asdict(v)
    return as_mapping
|
||
|
|
|