# eventlens/backend/app/providers/barclays_arena.py

from datetime import datetime
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from app.models import RegionScope, WatchType
from app.providers.utils import normalize_search_text


class BarclaysArenaProvider:
    """Find events on the Barclays Arena (Hamburg) listing page by search term.

    Every search downloads ``events_url`` and inspects the ``<h3>`` headings
    of the returned HTML. The outcome of the most recent search is kept in
    ``last_status`` and ``last_message``.
    """

    source_name = "barclays_arena"
    events_url = "https://www.barclays-arena.de/events/search"

    # Outcome of the most recent ``search_events`` call. Declared on the class
    # so both attributes can be read before any search has completed; they are
    # ``None`` until a search finishes successfully.
    last_status: str | None = None
    last_message: str | None = None

    # Compiled once; these patterns are applied to every heading on the page.
    _EVENT_HREF_RE = re.compile(r"/events/")
    _WEEKDAY_DATE_RE = re.compile(
        r"(Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag)\s*\|\s*(\d{2}\.\d{2}\.\d{4})"
    )
    _PLAIN_DATE_RE = re.compile(r"\b(\d{2}\.\d{2}\.\d{4})\b")

    # Number of ``previous_element`` steps inspected when looking for a date.
    _DATE_LOOKBACK_STEPS = 6

    def search_events(
        self,
        term: str,
        watch_type: WatchType,
        region_scope: RegionScope,
    ) -> list[dict]:
        """Return the listed events whose title or subtitle contains ``term``.

        Args:
            term: Search text. It is passed through ``normalize_search_text``
                and matched as a substring of the equally normalized
                ``"<title> <subtitle>"`` text of each heading.
            watch_type: Not used by this provider.
            region_scope: Not used by this provider.

        Returns:
            One dict per distinct detail link (``external_id`` is the link's
            ``href``). ``event_date`` is a naive ``datetime`` or ``None`` when
            no date could be found next to the heading. When several headings
            share an ``href`` the last one wins.

        Raises:
            requests.RequestException: If the listing cannot be downloaded or
                the server answers with an error status.
        """
        # Clear the previous outcome first so that a failing request cannot
        # leave the "ok" of an earlier search behind.
        self.last_status = None
        self.last_message = None

        response = requests.get(
            self.events_url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=30,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        normalized_term = normalize_search_text(term)
        # Keyed by href so that a detail page appearing more than once on the
        # listing is reported a single time.
        matches: dict[str, dict] = {}

        for heading in soup.find_all("h3"):
            title = heading.get_text(" ", strip=True)
            if not title:
                continue

            subtitle_el = heading.find_next_sibling("h4")
            subtitle = subtitle_el.get_text(" ", strip=True) if subtitle_el else ""

            # Keep matching local to the actual heading/subtitle pair. Wider
            # parent containers often contain several event cards.
            haystack = normalize_search_text(f"{title} {subtitle}")
            if normalized_term not in haystack:
                continue

            detail_link = self._find_card_link(heading)
            if detail_link is None:
                continue

            date_text = self._find_card_date_text(heading)
            href = detail_link["href"]

            matches[href] = {
                "external_id": href,
                "title": title,
                "matched_term": term,
                "venue_name": "Barclays Arena",
                "city": "Hamburg",
                "country_code": "DE",
                "event_date": self._parse_german_date(date_text),
                "ticket_url": urljoin(self.events_url, href),
                "image_url": None,
                "raw_payload": {
                    "title": title,
                    "subtitle": subtitle,
                    "date_text": date_text,
                    "href": href,
                },
            }

        self.last_status = "ok"
        self.last_message = (
            f"Barclays Arena returned {len(matches)} matched events for term '{term}'."
        )
        return list(matches.values())

    def _find_card_link(self, heading):
        """Return the ``/events/`` link that wraps ``heading``, or ``None``.

        Only a link that is an actual ancestor of this very heading qualifies.
        Beautiful Soup's ``==`` compares markup rather than identity, so
        testing whether the heading is "in" another link's headings would also
        accept a different card that merely carries the same title. An
        ancestor lookup already covers every link that contains the heading.
        """
        return heading.find_parent("a", href=self._EVENT_HREF_RE)

    def _find_card_date_text(self, heading) -> str | None:
        """Return the ``DD.MM.YYYY`` text found just before ``heading``.

        Walks backwards in document order (``previous_element``) for at most
        ``_DATE_LOOKBACK_STEPS`` nodes and returns the first date found in a
        node's text, or ``None`` if there is none.
        """
        node = heading
        for _ in range(self._DATE_LOOKBACK_STEPS):
            node = node.previous_element
            if node is None:
                return None

            # A container that holds other headings mixes the text of several
            # event cards; a date taken from it may belong to another event.
            find_all = getattr(node, "find_all", None)
            if find_all is not None and any(
                other is not heading for other in find_all("h3")
            ):
                continue

            # Nodes without ``get_text`` (plain text nodes in some Beautiful
            # Soup versions) are used as their own string.
            get_text = getattr(node, "get_text", None)
            text = str(node) if get_text is None else get_text(" ", strip=True)

            date_text = self._extract_date_text(text)
            if date_text:
                return date_text
        return None

    def _extract_date_text(self, text: str) -> str | None:
        """Return the first ``DD.MM.YYYY`` date in ``text``, or ``None``.

        A date written as ``<German weekday> | DD.MM.YYYY`` takes precedence
        over any other date in the text, even one that appears earlier.
        """
        weekday_match = self._WEEKDAY_DATE_RE.search(text)
        if weekday_match:
            return weekday_match.group(2)

        plain_match = self._PLAIN_DATE_RE.search(text)
        return plain_match.group(1) if plain_match else None

    def _parse_german_date(self, value: str | None) -> datetime | None:
        """Parse ``DD.MM.YYYY`` into a naive ``datetime``.

        Returns ``None`` for empty input and for text that is not a valid
        calendar date in that format.
        """
        if not value:
            return None
        try:
            return datetime.strptime(value, "%d.%m.%Y")
        except ValueError:
            return None