update
This commit is contained in:
167
AzA march 2026/services/link_verify.py
Normal file
167
AzA march 2026/services/link_verify.py
Normal file
@@ -0,0 +1,167 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import threading
|
||||
from dataclasses import asdict, dataclass
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
# Lifetime of a cached verification result: seven days, expressed in seconds.
URL_VERIFY_TTL_SECONDS = 60 * 60 * 24 * 7

# In-process result cache: string key -> (verification result, expiry as a POSIX timestamp).
_verify_cache: dict[str, tuple["LinkVerification", float]] = {}

# Held around every lookup in and write to _verify_cache.
_verify_lock = threading.Lock()
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class LinkVerification:
    """Immutable record describing the outcome of checking one event URL.

    The camelCase field names are kept as-is because they become the keys of
    the dictionary produced by ``dataclasses.asdict``.
    """

    # HTTP status code of the evaluated response; 0 when no response was
    # obtained or the input URL was empty.
    httpStatus: int
    # URL reported by the evaluated response (after redirects), or the input
    # URL when no response was obtained.
    finalUrl: str
    # Number of redirect hops recorded in the response history.
    redirectCount: int
    # Verdict of the direct-event-page heuristic.
    isDirectEventPage: bool
    # ISO 8601 timestamp (UTC) taken when the record was created.
    checkedAt: str
|
||||
|
||||
|
||||
def _now_iso() -> str:
    """Return the current UTC time as an ISO 8601 string with an explicit offset."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
||||
|
||||
|
||||
def _url_slug_tokens(url: str) -> set[str]:
    """Return the distinct lowercase alphanumeric tokens in the path of *url*.

    Only the path component is considered; host, query and fragment are
    ignored. Any run of characters outside ``a-z0-9`` acts as a separator.
    """
    lowered_path = urlparse(url).path.lower()
    return {piece for piece in re.split(r"[^a-z0-9]+", lowered_path) if piece}
|
||||
|
||||
|
||||
def _name_tokens(name: str) -> set[str]:
    """Return the distinct lowercase alphanumeric tokens of *name* with at least 4 characters.

    A falsy *name* (``None`` or ``""``) yields an empty set.
    """
    lowered = name.lower() if name else ""
    pieces = re.split(r"[^a-z0-9]+", lowered)
    return {piece for piece in pieces if len(piece) >= 4}
|
||||
|
||||
|
||||
def _extract_title(html: str) -> str:
    """Return the text of the first ``<title>`` element in *html*.

    Whitespace runs are collapsed to single spaces, the result is trimmed and
    capped at 300 characters. Returns ``""`` when *html* is falsy or contains
    no title element.
    """
    found = re.search(r"<title[^>]*>(.*?)</title>", html or "", flags=re.IGNORECASE | re.DOTALL)
    if found is None:
        return ""
    collapsed = re.sub(r"\s+", " ", found.group(1))
    return collapsed.strip()[:300]
|
||||
|
||||
|
||||
def _looks_generic_url(final_url: str) -> bool:
    """Tell whether the path of *final_url* is a site root or a well-known listing page.

    The path is compared case-insensitively and without trailing slashes;
    query string and fragment are ignored.
    """
    raw_path = urlparse(final_url).path or "/"
    normalized = raw_path.strip().lower().rstrip("/")
    generic_paths = {"", "/", "/events", "/calendar", "/home", "/congress", "/meetings"}
    return normalized in generic_paths
|
||||
|
||||
|
||||
def _is_direct_event_page(final_url: str, event_name: str, title_text: str = "") -> bool:
    """Heuristically decide whether *final_url* is a page dedicated to *event_name*.

    A score is accumulated from independent signals:

    * +2 when the lowercased URL contains ``/20xx`` followed by ``/``, ``-``
      or the end of the string;
    * +2 when a token of the URL path equals a token of the event name;
    * +1 when one of (at most five) event-name tokens occurs in the title;
    * +1 when the title contains a standalone ``20xx`` year;
    * -2 when the URL path is a generic landing/listing path.

    Args:
        final_url: URL of the page being judged (after redirects).
        event_name: Human-readable event name to match against.
        title_text: Contents of the page's ``<title>``; may be empty.

    Returns:
        ``True`` when the accumulated score is at least 2.
    """
    slug = _url_slug_tokens(final_url)
    name_tokens = _name_tokens(event_name)
    title_l = (title_text or "").lower()

    # Sets of strings iterate in a hash-seed-dependent order, so slicing the
    # set directly would probe an arbitrary subset of tokens and could change
    # the verdict from one process to the next. Pick the probes
    # deterministically, longest (most distinctive) tokens first.
    title_probes = sorted(name_tokens, key=lambda tok: (-len(tok), tok))[:5]

    score = 0
    if re.search(r"/20\d{2}(?:[/-]|$)", final_url.lower()):
        score += 2
    if slug.intersection(name_tokens):
        score += 2
    if any(tok in title_l for tok in title_probes):
        score += 1
    if re.search(r"\b20\d{2}\b", title_l):
        score += 1
    if _looks_generic_url(final_url):
        score -= 2
    return score >= 2
|
||||
|
||||
|
||||
def _extract_candidate_links(base_url: str, html: str, event_name: str) -> list[str]:
    """Collect same-host links from *html* that look like they lead to the event page.

    A link qualifies when, after being resolved against *base_url*, it uses
    http(s), points at the same host as *base_url*, and its path either
    contains a ``/20xx`` year segment or contains one of the event-name
    tokens as a substring.

    Args:
        base_url: URL the HTML was fetched from; used to resolve relative hrefs.
        html: Page markup to scan; may be empty or ``None``.
        event_name: Event name whose tokens are searched for in link paths.

    Returns:
        Up to 8 distinct absolute URLs, in document order.
    """
    hrefs = re.findall(r'href=["\']([^"\']+)["\']', html or "", flags=re.IGNORECASE)
    base_host = (urlparse(base_url).netloc or "").lower()
    tokens = _name_tokens(event_name)
    out: list[str] = []
    for href in hrefs:
        try:
            u = urljoin(base_url, href)
            p = urlparse(u)
        except ValueError:
            # hrefs come from untrusted markup; a malformed one (for example
            # an unbalanced "[" in the host part) makes urllib raise. Skip it
            # instead of aborting the whole scan.
            continue
        if p.scheme not in {"http", "https"}:
            continue
        if (p.netloc or "").lower() != base_host:
            continue
        path_l = p.path.lower()
        if re.search(r"/20\d{2}(?:[/-]|$)", path_l) or any(tok in path_l for tok in tokens):
            if u not in out:
                out.append(u)
        # Cap the number of follow-up requests the caller will make.
        if len(out) >= 8:
            break
    return out
|
||||
|
||||
|
||||
def _request_with_fallback(url: str, timeout_s: int = 5) -> requests.Response | None:
    """Fetch *url*, probing with HEAD first and downloading with GET when needed.

    The HEAD probe is kept only when it succeeds (status < 400) and the
    server declares a non-HTML ``Content-Type``. In every other case a GET is
    issued, because a HEAD response carries no body and the caller in this
    module reads ``resp.text`` to extract the page title and candidate links.

    Args:
        url: Absolute URL to request. Redirects are followed.
        timeout_s: Timeout in seconds for the HEAD probe; the GET is given
            two extra seconds.

    Returns:
        The GET response when one was obtained; otherwise the successful HEAD
        response if there was one; otherwise ``None``.
    """
    headers = {"User-Agent": "AZA-LiveEventSearch/1.0"}

    try:
        probe = requests.head(url, allow_redirects=True, timeout=timeout_s, headers=headers)
    except Exception:
        # Deliberately broad: the URL is untrusted input and this check is
        # best effort. A failed probe simply means we go straight to GET.
        probe = None

    probe_ok = probe is not None and probe.status_code < 400
    if probe_ok:
        content_type = (probe.headers.get("Content-Type") or "").lower()
        # Skip the download only when the server positively says the resource
        # is not HTML (PDF, image, ...); there is no title or link to parse.
        if content_type and "html" not in content_type:
            return probe

    # GET is attempted at most once, so a dead host costs one timeout, not two.
    try:
        return requests.get(url, allow_redirects=True, timeout=timeout_s + 2, headers=headers)
    except Exception:
        # A successful HEAD still proves the URL resolves, so report that
        # rather than a total failure.
        return probe if probe_ok else None
|
||||
|
||||
|
||||
def verify_event_url(url: str, event_name: str) -> LinkVerification:
    """Check that *url* is reachable and whether it is a dedicated page for *event_name*.

    Results are memoised in the module-level cache for
    ``URL_VERIFY_TTL_SECONDS``. The cache key combines the URL with the
    event-name tokens, because both ``isDirectEventPage`` and ``finalUrl``
    depend on the event name; keying on the URL alone would hand one event's
    verdict to another event that shares the same landing page.

    Args:
        url: URL to verify. Surrounding whitespace is ignored.
        event_name: Name of the event the URL is supposed to describe.

    Returns:
        A ``LinkVerification``. An empty URL yields a record with status 0
        and an empty ``finalUrl`` and is not cached.
    """
    u = (url or "").strip()
    if not u:
        return LinkVerification(httpStatus=0, finalUrl="", redirectCount=0, isDirectEventPage=False, checkedAt=_now_iso())

    # The heuristics only ever look at the token set of the name, so that is
    # exactly what the key needs to distinguish.
    cache_key = u + "\n" + " ".join(sorted(_name_tokens(event_name)))
    now_ts = datetime.now(timezone.utc).timestamp()
    with _verify_lock:
        cached = _verify_cache.get(cache_key)
        if cached and cached[1] > now_ts:
            return cached[0]

    # Network work happens outside the lock so slow hosts do not block other callers.
    out = _verify_uncached(u, event_name)

    # NOTE(review): transport failures (httpStatus 0) are cached for the full
    # TTL as well, as before; consider a shorter lifetime for them.
    with _verify_lock:
        _verify_cache[cache_key] = (out, now_ts + URL_VERIFY_TTL_SECONDS)
    return out


def _verify_uncached(u: str, event_name: str) -> LinkVerification:
    """Run the network checks for ``verify_event_url`` without touching the cache."""
    resp = _request_with_fallback(u, timeout_s=5)
    if resp is None:
        return LinkVerification(httpStatus=0, finalUrl=u, redirectCount=0, isDirectEventPage=False, checkedAt=_now_iso())

    final_url = str(resp.url or u)
    status = int(resp.status_code or 0)
    redirects = len(resp.history or [])
    # Only the head of the document is inspected for the title and links.
    text = resp.text[:12000] if isinstance(resp.text, str) else ""
    title_text = _extract_title(text)
    is_direct = (200 <= status < 300) and _is_direct_event_page(final_url, event_name, title_text=title_text)

    # The page loaded but looks like a landing/listing page: follow promising
    # same-host links one level deep and accept the first that qualifies.
    if (200 <= status < 300) and not is_direct and text:
        for cand in _extract_candidate_links(final_url, text, event_name=event_name):
            second = _request_with_fallback(cand, timeout_s=5)
            if second is None:
                continue
            second_status = int(second.status_code or 0)
            second_final = str(second.url or cand)
            second_redirects = len(second.history or [])
            second_title = _extract_title((second.text or "")[:12000] if isinstance(second.text, str) else "")
            second_direct = (200 <= second_status < 300) and _is_direct_event_page(second_final, event_name, title_text=second_title)
            if second_direct:
                return LinkVerification(
                    httpStatus=second_status,
                    finalUrl=second_final,
                    redirectCount=second_redirects,
                    isDirectEventPage=True,
                    checkedAt=_now_iso(),
                )

    return LinkVerification(
        httpStatus=status,
        finalUrl=final_url,
        redirectCount=redirects,
        isDirectEventPage=is_direct,
        checkedAt=_now_iso(),
    )
|
||||
|
||||
|
||||
def verification_to_dict(v: LinkVerification) -> dict[str, Any]:
    """Convert *v* into a plain dictionary keyed by its field names."""
    as_mapping: dict[str, Any] = asdict(v)
    return as_mapping
|
||||
|
||||
Reference in New Issue
Block a user