diff --git a/README.md b/README.md index 38e3ff8..9dd3727 100644 --- a/README.md +++ b/README.md @@ -114,3 +114,4 @@ sudo docker compose up -d --build - Bandsintown benoetigt eine echte, von Bandsintown freigeschaltete App-ID. Ohne diese wird der Provider deaktiviert oder als `blocked` angezeigt. - Barclays Arena wird ueber die offizielle Eventseite der Arena abgefragt. - Fabrik wird ueber die offizielle Veranstaltungsseite der Fabrik Hamburg abgefragt. +- Fuer robuste persoenliche Ueberwachung koennen pro Watchlist-Eintrag direkte Quellen-URLs hinterlegt werden. Diese werden beim Sync gezielt per JSON-LD und HTML-Textscan durchsucht. diff --git a/backend/app/frontend/static/app.js b/backend/app/frontend/static/app.js index 2cb76a2..35bc242 100644 --- a/backend/app/frontend/static/app.js +++ b/backend/app/frontend/static/app.js @@ -3,6 +3,7 @@ const state = { events: [], notifications: [], providerStatuses: [], + watchSources: [], }; const watchItemsEl = document.querySelector("#watch-items"); @@ -92,6 +93,12 @@ function renderStats() { } function prettifyProviderName(value) { + if (value?.startsWith("source:")) { + const sourceId = Number(value.split(":")[1]); + const source = state.watchSources.find((entry) => entry.id === sourceId); + return source?.label || "Direkte Quelle"; + } + const names = { ticketmaster: "Ticketmaster", bandsintown: "Bandsintown", @@ -206,12 +213,71 @@ function renderWatchItems() {

${escapeHtml(item.notes || "Keine Notiz hinterlegt.")}

+ ${renderSourceList(item.id)} +
+ + + +
` ) .join(""); } +function renderSourceList(watchItemId) { + const sources = state.watchSources.filter((source) => source.watch_item_id === watchItemId); + if (!sources.length) { + return '
Noch keine direkten Quellen hinterlegt.
'; + } + + return ` +
+ ${sources + .map( + (source) => ` +
+
+ ${escapeHtml(source.label || "Quelle")} + + ${escapeHtml(source.url)} + +
+ ${escapeHtml(source.last_status)} + ${escapeHtml(source.last_message || "Noch nicht gescannt.")} +
+
+
+ + +
+
+ ` + ) + .join("")} +
+ `; +} + function getWatchNameById(id) { return state.watchItems.find((item) => item.id === id)?.name || `Watch #${id}`; } @@ -270,6 +336,13 @@ function renderEvents() { > ${event.is_ticket_purchased ? "Ticket entfernen" : "Ticket gekauft"} +
@@ -328,17 +401,19 @@ function updateSyncStatus(message) { } async function loadData() { - const [watchItems, events, notifications, providerStatuses] = await Promise.all([ + const [watchItems, events, notifications, providerStatuses, watchSources] = await Promise.all([ apiFetch("/watch-items"), apiFetch("/events"), apiFetch("/notifications"), apiFetch("/provider-statuses"), + apiFetch("/watch-sources"), ]); state.watchItems = watchItems; state.events = events; state.notifications = notifications; state.providerStatuses = providerStatuses; + state.watchSources = watchSources; renderStats(); renderWatchItems(); @@ -433,7 +508,60 @@ document.addEventListener("click", async (event) => { }); await loadData(); showToast("Ticketstatus aktualisiert."); + return; } + + if (action === "delete-event") { + await apiFetch(`/events/${id}`, { method: "DELETE" }); + await loadData(); + showToast("Event geloescht."); + return; + } + + if (action === "delete-source") { + await apiFetch(`/watch-sources/${id}`, { method: "DELETE" }); + await loadData(); + showToast("Quelle geloescht."); + return; + } + + if (action === "toggle-source") { + const source = state.watchSources.find((entry) => entry.id === Number(id)); + await apiFetch(`/watch-sources/${id}`, { + method: "PATCH", + body: JSON.stringify({ is_active: !source.is_active }), + }); + await loadData(); + showToast("Quellenstatus aktualisiert."); + } + } catch (error) { + showToast(error.message); + } +}); + +document.addEventListener("submit", async (event) => { + const form = event.target.closest(".source-form"); + if (!form) { + return; + } + + event.preventDefault(); + const watchId = form.dataset.watchId; + const formData = new FormData(form); + const payload = { + label: formData.get("label")?.toString().trim() || null, + url: formData.get("url")?.toString().trim(), + parser_type: "auto", + }; + + try { + await apiFetch(`/watch-items/${watchId}/sources`, { + method: "POST", + body: JSON.stringify(payload), + }); + 
form.reset(); + await loadData(); + showToast("Quelle hinzugefuegt."); } catch (error) { showToast(error.message); } diff --git a/backend/app/frontend/static/styles.css b/backend/app/frontend/static/styles.css index 86833a2..7420e53 100644 --- a/backend/app/frontend/static/styles.css +++ b/backend/app/frontend/static/styles.css @@ -381,6 +381,37 @@ button:hover, line-height: 1.55; } +.source-list { + display: grid; + gap: 10px; + margin-top: 16px; +} + +.source-row { + display: flex; + align-items: start; + justify-content: space-between; + gap: 16px; + padding: 14px; + border-radius: var(--radius-sm); + background: rgba(46, 39, 30, 0.05); +} + +.source-row a { + display: block; + max-width: 46ch; + margin: 4px 0 8px; + overflow-wrap: anywhere; + color: var(--primary-dark); +} + +.source-form { + display: grid; + grid-template-columns: 0.7fr 1.4fr auto; + gap: 10px; + margin-top: 14px; +} + .action-button { min-height: 38px; padding: 0 14px; @@ -467,6 +498,7 @@ button:hover, } .watch-form, + .source-form, .status-panel { grid-template-columns: 1fr; } diff --git a/backend/app/main.py b/backend/app/main.py index ade1aab..6edba14 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -8,7 +8,7 @@ from sqlalchemy.orm import Session from app.config import settings from app.database import Base, engine, get_db -from app.models import TrackedEvent, WatchItem +from app.models import TrackedEvent, WatchItem, WatchSource from app.scheduler import start_scheduler from app.schemas import ( NotificationLogRead, @@ -19,11 +19,15 @@ from app.schemas import ( WatchItemCreate, WatchItemRead, WatchItemUpdate, + WatchSourceCreate, + WatchSourceRead, + WatchSourceUpdate, ) from app.services import ( list_events, list_notifications, list_provider_statuses, + list_watch_sources, list_watch_items, run_sync, ) @@ -114,6 +118,66 @@ def delete_watch_item(watch_item_id: int, db: Session = Depends(get_db)): db.commit() +@app.get("/watch-sources", response_model=list[WatchSourceRead]) 
@app.patch("/watch-sources/{source_id}", response_model=WatchSourceRead)
def update_watch_source(
    source_id: int,
    payload: WatchSourceUpdate,
    db: Session = Depends(get_db),
):
    """Apply a partial update to one watch source and return the stored row.

    Only fields that are present in the request body are written. An
    explicit ``null`` is accepted for ``label`` alone, because that is the
    only updatable column declared nullable on ``WatchSource``.

    Raises:
        HTTPException: 404 when no source has ``source_id``; 422 when a
            non-nullable field is sent as ``null``.
    """
    source = db.get(WatchSource, source_id)
    if source is None:
        raise HTTPException(status_code=404, detail="Quelle nicht gefunden.")

    updates = payload.model_dump(exclude_unset=True)

    # exclude_unset still keeps fields the client explicitly set to null.
    # Writing those to url / parser_type / is_active would only fail at
    # commit time with a NOT NULL violation, so reject them up front.
    null_fields = sorted(
        field_name
        for field_name, value in updates.items()
        if value is None and field_name != "label"
    )
    if null_fields:
        raise HTTPException(
            status_code=422,
            detail=f"Felder duerfen nicht null sein: {', '.join(null_fields)}.",
        )

    for field_name, value in updates.items():
        setattr(source, field_name, value)
    source.updated_at = datetime.utcnow()
    db.commit()
    db.refresh(source)
    return source
HTTPException(status_code=404, detail="Event nicht gefunden.") + + db.delete(tracked_event) + db.commit() + + @app.get("/notifications", response_model=list[NotificationLogRead]) def get_notifications(db: Session = Depends(get_db)): return list_notifications(db) diff --git a/backend/app/models.py b/backend/app/models.py index 03a7bcc..6cb84c8 100644 --- a/backend/app/models.py +++ b/backend/app/models.py @@ -35,6 +35,13 @@ class ProviderStatusType(str, Enum): error = "error" +class SourceStatusType(str, Enum): + pending = "pending" + ok = "ok" + no_match = "no_match" + error = "error" + + class WatchItem(Base): __tablename__ = "watch_items" @@ -57,6 +64,36 @@ class WatchItem(Base): tracked_events: Mapped[list["TrackedEvent"]] = relationship( back_populates="watch_item", cascade="all, delete-orphan" ) + sources: Mapped[list["WatchSource"]] = relationship( + back_populates="watch_item", cascade="all, delete-orphan" + ) + + +class WatchSource(Base): + __tablename__ = "watch_sources" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, index=True) + watch_item_id: Mapped[int] = mapped_column(ForeignKey("watch_items.id"), nullable=False) + label: Mapped[str | None] = mapped_column(String(255), nullable=True) + url: Mapped[str] = mapped_column(String(1024), nullable=False) + parser_type: Mapped[str] = mapped_column(String(50), default="auto", nullable=False) + is_active: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False) + last_status: Mapped[SourceStatusType] = mapped_column( + SqlEnum(SourceStatusType), default=SourceStatusType.pending, nullable=False + ) + last_message: Mapped[str | None] = mapped_column(Text, nullable=True) + last_checked_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) + created_at: Mapped[datetime] = mapped_column( + DateTime, default=datetime.utcnow, nullable=False + ) + updated_at: Mapped[datetime] = mapped_column( + DateTime, + default=datetime.utcnow, + onupdate=datetime.utcnow, + nullable=False, + 
) + + watch_item: Mapped[WatchItem] = relationship(back_populates="sources") class TrackedEvent(Base): diff --git a/backend/app/providers/barclays_arena.py b/backend/app/providers/barclays_arena.py index bcdf841..3292477 100644 --- a/backend/app/providers/barclays_arena.py +++ b/backend/app/providers/barclays_arena.py @@ -1,4 +1,5 @@ from datetime import datetime +import re from urllib.parse import urljoin import requests @@ -10,7 +11,7 @@ from app.providers.utils import normalize_search_text class BarclaysArenaProvider: source_name = "barclays_arena" - events_url = "https://www.barclays-arena.de/events" + events_url = "https://www.barclays-arena.de/events/search" def search_events( self, @@ -29,73 +30,104 @@ class BarclaysArenaProvider: normalized_term = normalize_search_text(term) results: list[dict] = [] - headings = soup.find_all("h3") - for heading in headings: + for heading in soup.find_all("h3"): title = heading.get_text(" ", strip=True) if not title: continue - subtitle = "" - subtitle_el = heading.find_next("h4") - if subtitle_el: - subtitle = subtitle_el.get_text(" ", strip=True) + subtitle_el = heading.find_next_sibling("h4") + subtitle = subtitle_el.get_text(" ", strip=True) if subtitle_el else "" + # Keep matching local to the actual heading/subtitle pair. Wider + # parent containers often contain several event cards. 
haystack = normalize_search_text(f"{title} {subtitle}") if normalized_term not in haystack: continue - date_text = self._find_previous_date_text(heading) - event_date = self._parse_german_date(date_text) - - link = heading.find_previous("a", href=True) - if link is None: + detail_link = self._find_card_link(heading) + if detail_link is None: continue + date_text = self._find_card_date_text(heading) + event_date = self._parse_german_date(date_text) + href = detail_link["href"] + results.append( { - "external_id": link["href"], + "external_id": href, "title": title, "matched_term": term, "venue_name": "Barclays Arena", "city": "Hamburg", "country_code": "DE", "event_date": event_date, - "ticket_url": urljoin(self.events_url, link["href"]), + "ticket_url": urljoin(self.events_url, href), "image_url": None, "raw_payload": { "title": title, "subtitle": subtitle, "date_text": date_text, - "href": link["href"], + "href": href, }, } ) + unique_results: dict[str, dict] = {} + for result in results: + unique_results[result["external_id"]] = result + self.last_status = "ok" self.last_message = ( - f"Barclays Arena returned {len(results)} matched events for term '{term}'." + f"Barclays Arena returned {len(unique_results)} matched events for term '{term}'." 
) - return results + return list(unique_results.values()) - def _find_previous_date_text(self, heading) -> str | None: - current = heading.previous_sibling - while current is not None: + def _find_card_link(self, heading): + link = heading.find_parent("a", href=re.compile(r"/events/")) + if link is not None: + return link + + current = heading + for _ in range(5): + current = current.parent + if current is None: + return None + link = current.find("a", href=re.compile(r"/events/")) + if link is not None and heading in link.find_all("h3"): + return link + return None + + def _find_card_date_text(self, heading) -> str | None: + current = heading + for _ in range(6): + current = current.previous_element + if current is None: + return None text = getattr(current, "get_text", lambda *args, **kwargs: str(current))( " ", strip=True ) - if text and "|" in text: - return text - current = current.previous_sibling + date_text = self._extract_date_text(text) + if date_text: + return date_text + return None + + def _extract_date_text(self, text: str) -> str | None: + match = re.search( + r"(Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag)\s*\|\s*(\d{2}\.\d{2}\.\d{4})", + text, + ) + if match: + return match.group(2) + + match = re.search(r"\b(\d{2}\.\d{2}\.\d{4})\b", text) + if match: + return match.group(1) return None def _parse_german_date(self, value: str | None) -> datetime | None: if not value: return None - parts = [part.strip() for part in value.split("|")] - if len(parts) < 2: - return None try: - return datetime.strptime(parts[1], "%d.%m.%Y") + return datetime.strptime(value, "%d.%m.%Y") except ValueError: return None - diff --git a/backend/app/schemas.py b/backend/app/schemas.py index 611efb4..a96c167 100644 --- a/backend/app/schemas.py +++ b/backend/app/schemas.py @@ -7,6 +7,7 @@ from app.models import ( NotificationType, ProviderStatusType, RegionScope, + SourceStatusType, WatchType, ) @@ -39,6 +40,35 @@ class WatchItemRead(BaseModel): updated_at: 
def list_watch_sources(db: Session, watch_item_id: int | None = None) -> list[WatchSource]:
    """Return watch sources ordered by creation time.

    When ``watch_item_id`` is given, only the sources attached to that
    watch item are returned; otherwise every source is listed.
    """
    query = select(WatchSource)
    if watch_item_id is not None:
        query = query.where(WatchSource.watch_item_id == watch_item_id)
    ordered_query = query.order_by(WatchSource.created_at)
    return list(db.scalars(ordered_query))
SourceScanner() provider_states = { provider.source_name: init_provider_sync_state(provider.source_name) for provider in providers @@ -275,6 +286,75 @@ def run_sync(db: Session) -> SyncResult: notifications_skipped = 0 for watch_item in active_items: + active_sources = [source for source in watch_item.sources if source.is_active] + for source in active_sources: + try: + events = source_scanner.scan(watch_item, source) + source.last_status = ( + SourceStatusType.ok if events else SourceStatusType.no_match + ) + source.last_message = ( + f"{len(events)} passende Events gefunden." + if events + else "Keine passenden Events auf dieser Quelle gefunden." + ) + source.last_checked_at = datetime.utcnow() + except Exception as exc: + logger.exception( + "Source scan failed for watch_item=%s source=%s", + watch_item.name, + source.url, + ) + db.rollback() + source.last_status = SourceStatusType.error + source.last_message = f"Scan fehlgeschlagen: {exc}" + source.last_checked_at = datetime.utcnow() + db.add(source) + db.commit() + continue + + for event_data in events: + tracked_event, is_new = upsert_event( + db=db, + watch_item=watch_item, + provider_name=f"source:{source.id}", + event_data=event_data, + ) + if is_new: + new_events += 1 + else: + updated_events += 1 + + should_notify = ( + is_new + and tracked_event.discovery_notified_at is None + and not has_equivalent_existing_event(db, tracked_event) + ) + if should_notify: + status = send_email_notification( + db=db, + tracked_event=tracked_event, + notification_type=NotificationType.discovery, + subject=f"Neuer Termin fuer {watch_item.name}", + body=( + f"Es wurde ein neuer Termin fuer '{watch_item.name}' gefunden.\n\n" + f"Quelle: {source.label or source.url}\n" + f"Titel: {tracked_event.title}\n" + f"Ort: {tracked_event.venue_name or 'unbekannt'}\n" + f"Stadt: {tracked_event.city or 'unbekannt'}\n" + f"Datum: {tracked_event.event_date or 'unbekannt'}\n" + f"Tickets: {tracked_event.ticket_url or 'keine URL'}\n" + ), + 
# German month names and abbreviations mapped to their month number.
# Keys are looked up with the normalized form of the matched month token, so
# both the "ae" transliteration and the diacritic-stripped / raw spellings of
# "Maerz" are listed.
# NOTE(review): which of these forms normalize_search_text actually produces
# is not visible here - confirm and prune if desired.
MONTH_ALIASES = {
    "jan": 1,
    "januar": 1,
    "feb": 2,
    "februar": 2,
    "maer": 3,
    "maerz": 3,
    "mar": 3,
    "marz": 3,
    "mär": 3,
    "märz": 3,
    "apr": 4,
    "april": 4,
    "mai": 5,
    "jun": 6,
    "juni": 6,
    "jul": 7,
    "juli": 7,
    "aug": 8,
    "august": 8,
    "sep": 9,
    "sept": 9,
    "september": 9,
    "okt": 10,
    "oktober": 10,
    "nov": 11,
    "november": 11,
    "dez": 12,
    "dezember": 12,
}
WatchItem, source: WatchSource, html: str) -> list[dict]: + soup = BeautifulSoup(html, "html.parser") + jsonld_events = [] + + for script in soup.find_all("script", type="application/ld+json"): + raw_payload = script.string or script.get_text() + if not raw_payload: + continue + try: + payload = json.loads(unescape(raw_payload)) + except json.JSONDecodeError: + continue + jsonld_events.extend(self._extract_jsonld_events(payload)) + + jsonld_results = self._events_from_jsonld(watch_item, source, jsonld_events) + if jsonld_results: + return jsonld_results + + return self._events_from_html_text(watch_item, source, soup) + + def _extract_jsonld_events(self, payload) -> list[dict]: + events: list[dict] = [] + if isinstance(payload, list): + for item in payload: + events.extend(self._extract_jsonld_events(item)) + return events + + if not isinstance(payload, dict): + return events + + graph = payload.get("@graph") + if isinstance(graph, list): + for item in graph: + events.extend(self._extract_jsonld_events(item)) + + item_type = payload.get("@type") + if isinstance(item_type, list): + is_event = "Event" in item_type + else: + is_event = item_type == "Event" + if is_event: + events.append(payload) + + return events + + def _events_from_jsonld( + self, + watch_item: WatchItem, + source: WatchSource, + events: list[dict], + ) -> list[dict]: + results: list[dict] = [] + normalized_term = normalize_search_text(watch_item.name) + + for event in events: + title = event.get("name") or "" + performers = self._extract_performer_names(event) + haystack = normalize_search_text(" ".join([title] + performers)) + if normalized_term not in haystack: + continue + + location = event.get("location") or {} + address = location.get("address") or {} + city = address.get("addressLocality") or location.get("name") + if watch_item.region_scope == RegionScope.hamburg and normalize_search_text(city) != "hamburg": + continue + + event_date = self._parse_datetime(event.get("startDate")) + if 
event_date and event_date.date() < datetime.utcnow().date(): + continue + ticket_url = event.get("url") or source.url + + results.append( + { + "external_id": str(event.get("@id") or ticket_url or f"{source.id}:{title}"), + "title": title or watch_item.name, + "matched_term": watch_item.name, + "venue_name": location.get("name") or source.label, + "city": city, + "country_code": "DE", + "event_date": event_date, + "ticket_url": ticket_url, + "image_url": self._extract_image(event), + "raw_payload": event, + } + ) + + return results + + def _events_from_html_text( + self, + watch_item: WatchItem, + source: WatchSource, + soup: BeautifulSoup, + ) -> list[dict]: + text = soup.get_text(" ", strip=True) + normalized_text = normalize_search_text(text) + normalized_term = normalize_search_text(watch_item.name) + if normalized_term not in normalized_text: + return [] + + results: list[dict] = [] + seen_keys: set[str] = set() + for context in self._find_matching_contexts(soup, watch_item): + context_text = context.get_text(" ", strip=True) + event_date = self._find_nearest_date(context_text, watch_item.name) + if event_date is None: + continue + if event_date.date() < datetime.utcnow().date(): + continue + if ( + watch_item.region_scope == RegionScope.hamburg + and "hamburg" not in normalize_search_text(context_text) + ): + continue + + title = self._find_title(context, watch_item.name) + link = self._find_nearest_link(context, watch_item.name, source.url) or source.url + key = f"{source.id}:{normalize_search_text(title)}:{event_date.date().isoformat()}" + if key in seen_keys: + continue + seen_keys.add(key) + + results.append( + { + "external_id": key, + "title": title, + "matched_term": watch_item.name, + "venue_name": self._find_venue(context_text, source.label), + "city": "Hamburg" if watch_item.region_scope == RegionScope.hamburg else None, + "country_code": "DE", + "event_date": event_date, + "ticket_url": link, + "image_url": None, + "raw_payload": { + "source_url": 
source.url, + "parser": "html_text", + "context": context_text[:1000], + }, + } + ) + + return results + + def _extract_performer_names(self, event: dict) -> list[str]: + performer = event.get("performer") or event.get("performers") + if isinstance(performer, dict): + return [performer.get("name", "")] + if isinstance(performer, list): + return [item.get("name", "") for item in performer if isinstance(item, dict)] + return [] + + def _extract_image(self, event: dict) -> str | None: + image = event.get("image") + if isinstance(image, str): + return image + if isinstance(image, list): + for item in image: + if isinstance(item, str): + return item + return None + + def _parse_datetime(self, value: str | None) -> datetime | None: + if not value: + return None + try: + return datetime.fromisoformat(value.replace("Z", "+00:00")).replace(tzinfo=None) + except ValueError: + pass + for fmt in ("%d.%m.%Y", "%Y-%m-%d"): + try: + return datetime.strptime(value[:10], fmt) + except ValueError: + continue + return None + + def _find_nearest_date(self, text: str, term: str) -> datetime | None: + normalized_term = normalize_search_text(term) + normalized_text = normalize_search_text(text) + term_index = normalized_text.find(normalized_term) + search_area = text + if term_index >= 0: + start = max(0, term_index - 300) + end = min(len(text), term_index + 500) + search_area = text[start:end] + + candidates: list[datetime] = [] + for pattern in ( + r"\b(\d{1,2}\.\d{1,2}\.\d{4})\b", + r"\b(\d{1,2}\.\d{1,2}\.\d{2})\b", + r"\b(\d{1,2}\.\d{1,2}\.)\b", + ): + for match in re.finditer(pattern, search_area): + parsed = self._parse_german_date(match.group(1)) + if parsed: + candidates.append(parsed) + + month_name_pattern = ( + r"jan(?:uar)?|feb(?:ruar)?|m(?:ae|รค)r(?:z)?|apr(?:il)?|mai|jun(?:i)?|" + r"jul(?:i)?|aug(?:ust)?|sep(?:t|tember)?|okt(?:ober)?|nov(?:ember)?|dez(?:ember)?" 
+ ) + for match in re.finditer( + rf"\b(\d{{1,2}})\.?\s+({month_name_pattern})\.?\s*(\d{{4}})?\b", + search_area, + re.IGNORECASE, + ): + parsed = self._parse_named_month_date(match.group(1), match.group(2), match.group(3)) + if parsed: + candidates.append(parsed) + for match in re.finditer( + rf"\b({month_name_pattern})\.?\s+(\d{{1,2}})\.?\s*(\d{{4}})?\b", + search_area, + re.IGNORECASE, + ): + parsed = self._parse_named_month_date(match.group(2), match.group(1), match.group(3)) + if parsed: + candidates.append(parsed) + + future_candidates = [ + candidate for candidate in candidates if candidate.date() >= datetime.utcnow().date() + ] + if future_candidates: + return sorted(future_candidates)[0] + return sorted(candidates)[0] if candidates else None + + def _parse_german_date(self, value: str) -> datetime | None: + cleaned = value.strip() + current_year = datetime.utcnow().year + candidates = [cleaned] + if re.fullmatch(r"\d{1,2}\.\d{1,2}\.", cleaned): + candidates.append(f"{cleaned}{current_year}") + candidates.append(f"{cleaned}{current_year + 1}") + elif re.fullmatch(r"\d{1,2}\.\d{1,2}\.\d{2}", cleaned): + day, month, year = cleaned.split(".") + candidates.append(f"{day}.{month}.20{year}") + + for candidate in candidates: + try: + parsed = datetime.strptime(candidate, "%d.%m.%Y") + if parsed.date() < datetime.utcnow().date() and candidate != cleaned: + continue + return parsed + except ValueError: + continue + return None + + def _parse_named_month_date( + self, + day_value: str, + month_value: str, + year_value: str | None, + ) -> datetime | None: + month = MONTH_ALIASES.get(normalize_search_text(month_value).rstrip(".")) + if month is None: + return None + + day = int(day_value) + current_year = datetime.utcnow().year + years = [int(year_value)] if year_value else [current_year, current_year + 1] + for year in years: + try: + parsed = datetime(year, month, day) + except ValueError: + continue + if year_value or parsed.date() >= datetime.utcnow().date(): + 
def _find_venue(self, text: str, default: str) -> str:
    """Pick a venue line out of a block of context text.

    The text is split on line breaks and on runs of two or more whitespace
    characters. The first non-empty fragment that mentions "hamburg" (after
    normalization) and is at most 120 characters long is returned;
    ``default`` is returned when no fragment qualifies.
    """
    fragments = (chunk.strip() for chunk in re.split(r"\s{2,}|\n|\r", text))
    return next(
        (
            fragment
            for fragment in fragments
            if fragment
            and "hamburg" in normalize_search_text(fragment)
            and len(fragment) <= 120
        ),
        default,
    )
def _find_nearest_link(self, soup: BeautifulSoup, term: str, base_url: str) -> str | None:
    """Return the absolute URL of the first link whose text mentions ``term``.

    Every ``<a>`` element with an ``href`` inside ``soup`` is checked in
    document order; link text and term are compared in normalized form.
    The ``href`` is resolved against ``base_url``. Returns ``None`` when no
    link text contains the term.
    """
    needle = normalize_search_text(term)
    anchors = soup.find_all("a", href=True)
    hit = next(
        (
            anchor
            for anchor in anchors
            if needle in normalize_search_text(anchor.get_text(" ", strip=True))
        ),
        None,
    )
    if hit is None:
        return None
    return urljoin(base_url, hit["href"])