134 lines
4.3 KiB
Python
134 lines
4.3 KiB
Python
from datetime import datetime
|
|
import re
|
|
from urllib.parse import urljoin
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from app.models import RegionScope, WatchType
|
|
from app.providers.utils import normalize_search_text
|
|
|
|
|
|
class BarclaysArenaProvider:
    """Scrape the Barclays Arena event listing for events matching a term.

    The provider downloads the listing page at ``events_url``, treats every
    ``<h3>`` element as an event title and keeps the events whose title or
    following ``<h4>`` subtitle contains the normalized search term.
    """

    source_name = "barclays_arena"
    events_url = "https://www.barclays-arena.de/events/search"

    # Outcome of the most recent successful ``search_events`` call. Declared
    # at class level so both attributes can be read before the first search
    # (or after a search that raised) without an AttributeError.
    last_status: str | None = None
    last_message: str | None = None

    # How many ancestors of a heading are inspected when looking for its link,
    # and how many preceding parse-tree elements are scanned for a date.
    _LINK_SEARCH_DEPTH = 5
    _DATE_LOOKBACK = 6

    # Compiled once instead of on every heading / every call.
    _EVENT_HREF_RE = re.compile(r"/events/")
    # Day and month accept one or two digits so "5.3.2025" is recognised as
    # well as "05.03.2025".
    _WEEKDAY_DATE_RE = re.compile(
        r"(Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag)"
        r"\s*\|\s*(\d{1,2}\.\d{1,2}\.\d{4})"
    )
    _PLAIN_DATE_RE = re.compile(r"\b(\d{1,2}\.\d{1,2}\.\d{4})\b")

    def search_events(
        self,
        term: str,
        watch_type: WatchType,
        region_scope: RegionScope,
    ) -> list[dict]:
        """Return the listed events whose title or subtitle contains *term*.

        Args:
            term: Search text; it is normalized with ``normalize_search_text``
                and matched as a substring of the normalized title/subtitle.
            watch_type: Accepted for interface compatibility; not used here.
            region_scope: Accepted for interface compatibility; not used here.

        Returns:
            One dict per distinct event link, in page order. ``event_date`` is
            a naive ``datetime`` or ``None`` when no date was found near the
            heading.

        Raises:
            requests.RequestException: If the listing page cannot be fetched
                or the server answers with an HTTP error status.
        """
        response = requests.get(
            self.events_url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=30,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # NOTE(review): if the normalized term is empty, the substring test
        # below matches every heading - confirm callers never pass "".
        normalized_term = normalize_search_text(term)

        # Keyed by href so an event that appears several times on the page is
        # reported once; a later occurrence overwrites the earlier payload
        # but keeps the position of the first one.
        unique_results: dict[str, dict] = {}

        for heading in soup.find_all("h3"):
            title = heading.get_text(" ", strip=True)
            if not title:
                continue

            subtitle_el = heading.find_next_sibling("h4")
            subtitle = subtitle_el.get_text(" ", strip=True) if subtitle_el else ""

            # Keep matching local to the actual heading/subtitle pair. Wider
            # parent containers often contain several event cards.
            haystack = normalize_search_text(f"{title} {subtitle}")
            if normalized_term not in haystack:
                continue

            detail_link = self._find_card_link(heading)
            if detail_link is None:
                continue

            date_text = self._find_card_date_text(heading)
            href = detail_link["href"]

            unique_results[href] = {
                "external_id": href,
                "title": title,
                "matched_term": term,
                "venue_name": "Barclays Arena",
                "city": "Hamburg",
                "country_code": "DE",
                "event_date": self._parse_german_date(date_text),
                "ticket_url": urljoin(self.events_url, href),
                "image_url": None,
                "raw_payload": {
                    "title": title,
                    "subtitle": subtitle,
                    "date_text": date_text,
                    "href": href,
                },
            }

        self.last_status = "ok"
        self.last_message = (
            f"Barclays Arena returned {len(unique_results)} matched events for term '{term}'."
        )
        return list(unique_results.values())

    def _find_card_link(self, heading):
        """Return the ``/events/`` link that belongs to *heading*, or ``None``.

        The common case is a heading nested inside its own ``<a>``. Otherwise
        up to ``_LINK_SEARCH_DEPTH`` ancestors are inspected, and the first
        event link inside an ancestor is accepted only when it contains a
        matching ``<h3>``.
        """
        link = heading.find_parent("a", href=self._EVENT_HREF_RE)
        if link is not None:
            return link

        current = heading
        for _ in range(self._LINK_SEARCH_DEPTH):
            current = current.parent
            if current is None:
                return None
            link = current.find("a", href=self._EVENT_HREF_RE)
            # NOTE(review): ``in`` compares with ``==``, which BeautifulSoup
            # defines by markup rather than identity, so this can only succeed
            # for a look-alike <h3> (a link that truly contains ``heading``
            # was already returned above). Confirm that is intended.
            if link is not None and heading in link.find_all("h3"):
                return link
        return None

    def _find_card_date_text(self, heading) -> str | None:
        """Return the first date found in the elements parsed before *heading*.

        Walks ``previous_element`` at most ``_DATE_LOOKBACK`` steps and returns
        the first date string found, or ``None`` if none of them holds a date.
        """
        current = heading
        for _ in range(self._DATE_LOOKBACK):
            current = current.previous_element
            if current is None:
                return None
            # Fall back to ``str()`` for nodes that expose no ``get_text``.
            get_text = getattr(current, "get_text", None)
            if get_text is not None:
                text = get_text(" ", strip=True)
            else:
                text = str(current)
            date_text = self._extract_date_text(text)
            if date_text:
                return date_text
        return None

    def _extract_date_text(self, text: str) -> str | None:
        """Return the first ``d.m.yyyy`` date in *text*, or ``None``.

        A date introduced by a German weekday and a pipe (``Samstag | 05.03.2025``)
        wins over any other date in the text; otherwise the first
        stand-alone date is returned.
        """
        match = self._WEEKDAY_DATE_RE.search(text)
        if match:
            return match.group(2)

        match = self._PLAIN_DATE_RE.search(text)
        if match:
            return match.group(1)
        return None

    def _parse_german_date(self, value: str | None) -> datetime | None:
        """Parse a ``d.m.yyyy`` string into a naive ``datetime``.

        Returns ``None`` for empty input and for text that is not a valid
        calendar date. Surrounding whitespace is ignored.
        """
        if not value:
            return None
        try:
            return datetime.strptime(value.strip(), "%d.%m.%Y")
        except ValueError:
            return None
|