# eventlens/backend/app/source_scanner.py

import json
import re
from datetime import datetime
from html import unescape
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from app.models import RegionScope, WatchItem, WatchSource, WatchType
from app.providers.utils import normalize_search_text


# Maps normalized German month names and abbreviations to month numbers.
# March is listed both in its transliterated form ("maer"/"maerz") and in its
# diacritic-stripped form ("mar"/"marz") so the lookup works either way.
# NOTE(review): which of the two forms normalize_search_text() actually
# produces for "März" is not visible from this module — confirm.
MONTH_ALIASES = {
    "jan": 1,
    "januar": 1,
    "feb": 2,
    "februar": 2,
    "maer": 3,
    "maerz": 3,
    "mar": 3,
    "marz": 3,
    "apr": 4,
    "april": 4,
    "mai": 5,
    "jun": 6,
    "juni": 6,
    "jul": 7,
    "juli": 7,
    "aug": 8,
    "august": 8,
    "sep": 9,
    "sept": 9,
    "september": 9,
    "okt": 10,
    "oktober": 10,
    "nov": 11,
    "november": 11,
    "dez": 12,
    "dezember": 12,
}


class SourceScanner:
    # Request headers passed to requests.get() by scan(): a client identifier,
    # an Accept list covering HTML, XHTML and JSON, and a language preference
    # of German first, then English.
    headers = {
        "User-Agent": "eventlens/0.1 (+https://local)",
        "Accept": "text/html,application/xhtml+xml,application/json",
        "Accept-Language": "de-DE,de;q=0.9,en;q=0.7",
    }

    def scan(self, watch_item: WatchItem, source: WatchSource) -> list[dict]:
        response = requests.get(
            source.url,
            headers=self.headers,
            timeout=30,
        )
        response.raise_for_status()

        content_type = response.headers.get("content-type", "")
        if "application/json" in content_type:
            return self._scan_json(watch_item, source, response.json())

        return self._scan_html(watch_item, source, response.text)

    def _scan_json(self, watch_item: WatchItem, source: WatchSource, payload) -> list[dict]:
        events = self._extract_jsonld_events(payload)
        return self._events_from_jsonld(watch_item, source, events)

    def _scan_html(self, watch_item: WatchItem, source: WatchSource, html: str) -> list[dict]:
        soup = BeautifulSoup(html, "html.parser")
        jsonld_events = []

        for script in soup.find_all("script", type="application/ld+json"):
            raw_payload = script.string or script.get_text()
            if not raw_payload:
                continue
            try:
                payload = json.loads(unescape(raw_payload))
            except json.JSONDecodeError:
                continue
            jsonld_events.extend(self._extract_jsonld_events(payload))

        jsonld_results = self._events_from_jsonld(watch_item, source, jsonld_events)
        if jsonld_results:
            return jsonld_results

        return self._events_from_html_text(watch_item, source, soup)

    def _extract_jsonld_events(self, payload) -> list[dict]:
        events: list[dict] = []
        if isinstance(payload, list):
            for item in payload:
                events.extend(self._extract_jsonld_events(item))
            return events

        if not isinstance(payload, dict):
            return events

        graph = payload.get("@graph")
        if isinstance(graph, list):
            for item in graph:
                events.extend(self._extract_jsonld_events(item))

        item_type = payload.get("@type")
        if isinstance(item_type, list):
            is_event = "Event" in item_type
        else:
            is_event = item_type == "Event"
        if is_event:
            events.append(payload)

        return events

    def _events_from_jsonld(
        self,
        watch_item: WatchItem,
        source: WatchSource,
        events: list[dict],
    ) -> list[dict]:
        results: list[dict] = []
        normalized_term = normalize_search_text(watch_item.name)

        for event in events:
            title = event.get("name") or ""
            performers = self._extract_performer_names(event)
            haystack = normalize_search_text(" ".join([title] + performers))
            if normalized_term not in haystack:
                continue

            location = event.get("location") or {}
            address = location.get("address") or {}
            city = address.get("addressLocality") or location.get("name")
            if watch_item.region_scope == RegionScope.hamburg and normalize_search_text(city) != "hamburg":
                continue

            event_date = self._parse_datetime(event.get("startDate"))
            if event_date and event_date.date() < datetime.utcnow().date():
                continue
            ticket_url = event.get("url") or source.url

            results.append(
                {
                    "external_id": str(event.get("@id") or ticket_url or f"{source.id}:{title}"),
                    "title": title or watch_item.name,
                    "matched_term": watch_item.name,
                    "venue_name": location.get("name") or source.label,
                    "city": city,
                    "country_code": "DE",
                    "event_date": event_date,
                    "ticket_url": ticket_url,
                    "image_url": self._extract_image(event),
                    "raw_payload": event,
                }
            )

        return results

    def _events_from_html_text(
        self,
        watch_item: WatchItem,
        source: WatchSource,
        soup: BeautifulSoup,
    ) -> list[dict]:
        text = soup.get_text(" ", strip=True)
        normalized_text = normalize_search_text(text)
        normalized_term = normalize_search_text(watch_item.name)
        if normalized_term not in normalized_text:
            return []

        results: list[dict] = []
        seen_keys: set[str] = set()
        for context in self._find_matching_contexts(soup, watch_item):
            context_text = context.get_text(" ", strip=True)
            event_date = self._find_nearest_date(context_text, watch_item.name)
            if event_date is None:
                continue
            if event_date.date() < datetime.utcnow().date():
                continue
            if (
                watch_item.region_scope == RegionScope.hamburg
                and "hamburg" not in normalize_search_text(context_text)
            ):
                continue

            title = self._find_title(context, watch_item.name)
            link = self._find_nearest_link(context, watch_item.name, source.url) or source.url
            key = f"{source.id}:{normalize_search_text(title)}:{event_date.date().isoformat()}"
            if key in seen_keys:
                continue
            seen_keys.add(key)

            results.append(
                {
                    "external_id": key,
                    "title": title,
                    "matched_term": watch_item.name,
                    "venue_name": self._find_venue(context_text, source.label),
                    "city": "Hamburg" if watch_item.region_scope == RegionScope.hamburg else None,
                    "country_code": "DE",
                    "event_date": event_date,
                    "ticket_url": link,
                    "image_url": None,
                    "raw_payload": {
                        "source_url": source.url,
                        "parser": "html_text",
                        "context": context_text[:1000],
                    },
                }
            )

        return results

    def _extract_performer_names(self, event: dict) -> list[str]:
        performer = event.get("performer") or event.get("performers")
        if isinstance(performer, dict):
            return [performer.get("name", "")]
        if isinstance(performer, list):
            return [item.get("name", "") for item in performer if isinstance(item, dict)]
        return []

    def _extract_image(self, event: dict) -> str | None:
        image = event.get("image")
        if isinstance(image, str):
            return image
        if isinstance(image, list):
            for item in image:
                if isinstance(item, str):
                    return item
        return None

    def _parse_datetime(self, value: str | None) -> datetime | None:
        if not value:
            return None
        try:
            return datetime.fromisoformat(value.replace("Z", "+00:00")).replace(tzinfo=None)
        except ValueError:
            pass
        for fmt in ("%d.%m.%Y", "%Y-%m-%d"):
            try:
                return datetime.strptime(value[:10], fmt)
            except ValueError:
                continue
        return None

    def _find_nearest_date(self, text: str, term: str) -> datetime | None:
        normalized_term = normalize_search_text(term)
        normalized_text = normalize_search_text(text)
        term_index = normalized_text.find(normalized_term)
        search_area = text
        if term_index >= 0:
            start = max(0, term_index - 300)
            end = min(len(text), term_index + 500)
            search_area = text[start:end]

        candidates: list[datetime] = []
        for pattern in (
            r"\b(\d{1,2}\.\d{1,2}\.\d{4})\b",
            r"\b(\d{1,2}\.\d{1,2}\.\d{2})\b",
            r"\b(\d{1,2}\.\d{1,2}\.)\b",
        ):
            for match in re.finditer(pattern, search_area):
                parsed = self._parse_german_date(match.group(1))
                if parsed:
                    candidates.append(parsed)

        month_name_pattern = (
            r"jan(?:uar)?|feb(?:ruar)?|m(?:ae|ä)r(?:z)?|apr(?:il)?|mai|jun(?:i)?|"
            r"jul(?:i)?|aug(?:ust)?|sep(?:t|tember)?|okt(?:ober)?|nov(?:ember)?|dez(?:ember)?"
        )
        for match in re.finditer(
            rf"\b(\d{{1,2}})\.?\s+({month_name_pattern})\.?\s*(\d{{4}})?\b",
            search_area,
            re.IGNORECASE,
        ):
            parsed = self._parse_named_month_date(match.group(1), match.group(2), match.group(3))
            if parsed:
                candidates.append(parsed)
        for match in re.finditer(
            rf"\b({month_name_pattern})\.?\s+(\d{{1,2}})\.?\s*(\d{{4}})?\b",
            search_area,
            re.IGNORECASE,
        ):
            parsed = self._parse_named_month_date(match.group(2), match.group(1), match.group(3))
            if parsed:
                candidates.append(parsed)

        future_candidates = [
            candidate for candidate in candidates if candidate.date() >= datetime.utcnow().date()
        ]
        if future_candidates:
            return sorted(future_candidates)[0]
        return sorted(candidates)[0] if candidates else None

    def _parse_german_date(self, value: str) -> datetime | None:
        cleaned = value.strip()
        current_year = datetime.utcnow().year
        candidates = [cleaned]
        if re.fullmatch(r"\d{1,2}\.\d{1,2}\.", cleaned):
            candidates.append(f"{cleaned}{current_year}")
            candidates.append(f"{cleaned}{current_year + 1}")
        elif re.fullmatch(r"\d{1,2}\.\d{1,2}\.\d{2}", cleaned):
            day, month, year = cleaned.split(".")
            candidates.append(f"{day}.{month}.20{year}")

        for candidate in candidates:
            try:
                parsed = datetime.strptime(candidate, "%d.%m.%Y")
                if parsed.date() < datetime.utcnow().date() and candidate != cleaned:
                    continue
                return parsed
            except ValueError:
                continue
        return None

    def _parse_named_month_date(
        self,
        day_value: str,
        month_value: str,
        year_value: str | None,
    ) -> datetime | None:
        month = MONTH_ALIASES.get(normalize_search_text(month_value).rstrip("."))
        if month is None:
            return None

        day = int(day_value)
        current_year = datetime.utcnow().year
        years = [int(year_value)] if year_value else [current_year, current_year + 1]
        for year in years:
            try:
                parsed = datetime(year, month, day)
            except ValueError:
                continue
            if year_value or parsed.date() >= datetime.utcnow().date():
                return parsed
        return None

    def _find_matching_contexts(self, soup: BeautifulSoup, watch_item: WatchItem) -> list:
        normalized_term = normalize_search_text(watch_item.name)
        selectors = [
            "li.card",
            ".tourplan .row",
            "[class*=event]",
            "[class*=termin]",
            "article",
            "tr",
            "li",
            ".row",
        ]
        candidates = []
        seen_nodes = set()

        for selector in selectors:
            for node in soup.select(selector):
                if id(node) in seen_nodes:
                    continue
                seen_nodes.add(id(node))
                text = node.get_text(" ", strip=True)
                if normalized_term not in normalize_search_text(text):
                    continue
                if len(text) > 3500:
                    continue
                if self._find_nearest_date(text, watch_item.name):
                    candidates.append(node)

        if candidates:
            return candidates

        fallback = self._find_best_context(soup, watch_item.name)
        return [fallback] if fallback is not None else []

    def _find_venue(self, text: str, default: str) -> str:
        lines = [line.strip() for line in re.split(r"\s{2,}|\n|\r", text) if line.strip()]
        for line in lines:
            normalized = normalize_search_text(line)
            if "hamburg" in normalized and len(line) <= 120:
                return line
        return default

    def _find_best_context(self, soup: BeautifulSoup, term: str):
        normalized_term = normalize_search_text(term)
        candidates = []
        for node in soup.find_all(string=True):
            if normalized_term in normalize_search_text(str(node)):
                parent = node.parent
                if parent is None:
                    continue
                best_parent = self._climb_to_context_with_date(parent, term)
                text = best_parent.get_text(" ", strip=True)
                candidates.append(
                    (
                        0 if self._find_nearest_date(text, term) else 1,
                        len(text),
                        best_parent,
                    )
                )

        if not candidates:
            return None

        candidates.sort(key=lambda item: (item[0], item[1]))
        return candidates[0][2]

    def _climb_to_context_with_date(self, node, term: str):
        current = node
        best = node
        for _ in range(6):
            if current is None:
                break
            context_text = current.get_text(" ", strip=True)
            if self._find_nearest_date(context_text, term):
                return current
            best = current
            current = current.parent
        return best

    def _find_title(self, soup: BeautifulSoup, term: str) -> str:
        if soup is None:
            return term
        normalized_term = normalize_search_text(term)
        for heading in soup.find_all(["h1", "h2", "h3", "h4", "strong", "b", "a"]):
            title = heading.get_text(" ", strip=True)
            if normalized_term in normalize_search_text(title):
                return title

        text = soup.get_text(" ", strip=True)
        dated_match = re.search(
            r"(.{0,40}\d{1,2}\.\d{1,2}\.(?:\d{2,4})?.{0,100}"
            + re.escape(term)
            + r".{0,100})",
            text,
            re.IGNORECASE,
        )
        if dated_match:
            return " ".join(dated_match.group(1).split())

        match = re.search(r"(.{0,80}" + re.escape(term) + r".{0,80})", text, re.IGNORECASE)
        if match:
            return " ".join(match.group(1).split())
        return term

    def _find_nearest_link(self, soup: BeautifulSoup, term: str, base_url: str) -> str | None:
        normalized_term = normalize_search_text(term)
        for link in soup.find_all("a", href=True):
            if normalized_term in normalize_search_text(link.get_text(" ", strip=True)):
                return urljoin(base_url, link["href"])
        return None