diff --git a/README.md b/README.md
index 38e3ff8..9dd3727 100644
--- a/README.md
+++ b/README.md
@@ -114,3 +114,4 @@ sudo docker compose up -d --build
- Bandsintown benoetigt eine echte, von Bandsintown freigeschaltete App-ID. Ohne diese wird der Provider deaktiviert oder als `blocked` angezeigt.
- Barclays Arena wird ueber die offizielle Eventseite der Arena abgefragt.
- Fabrik wird ueber die offizielle Veranstaltungsseite der Fabrik Hamburg abgefragt.
+- Fuer robuste persoenliche Ueberwachung koennen pro Watchlist-Eintrag direkte Quellen-URLs hinterlegt werden. Diese werden beim Sync gezielt per JSON-LD und HTML-Textscan durchsucht.
diff --git a/backend/app/frontend/static/app.js b/backend/app/frontend/static/app.js
index 2cb76a2..35bc242 100644
--- a/backend/app/frontend/static/app.js
+++ b/backend/app/frontend/static/app.js
@@ -3,6 +3,7 @@ const state = {
events: [],
notifications: [],
providerStatuses: [],
+ watchSources: [],
};
const watchItemsEl = document.querySelector("#watch-items");
@@ -92,6 +93,12 @@ function renderStats() {
}
function prettifyProviderName(value) {
+ if (value?.startsWith("source:")) {
+ const sourceId = Number(value.split(":")[1]);
+ const source = state.watchSources.find((entry) => entry.id === sourceId);
+ return source?.label || "Direkte Quelle";
+ }
+
const names = {
ticketmaster: "Ticketmaster",
bandsintown: "Bandsintown",
@@ -206,12 +213,71 @@ function renderWatchItems() {
${escapeHtml(item.notes || "Keine Notiz hinterlegt.")}
+ ${renderSourceList(item.id)}
+
`
)
.join("");
}
+function renderSourceList(watchItemId) {
+ const sources = state.watchSources.filter((source) => source.watch_item_id === watchItemId);
+ if (!sources.length) {
+ return 'Noch keine direkten Quellen hinterlegt.
';
+ }
+
+ return `
+
+ ${sources
+ .map(
+ (source) => `
+
+
+
${escapeHtml(source.label || "Quelle")}
+
+ ${escapeHtml(source.url)}
+
+
+ ${escapeHtml(source.last_status)}
+ ${escapeHtml(source.last_message || "Noch nicht gescannt.")}
+
+
+
+
+
+
+
+ `
+ )
+ .join("")}
+
+ `;
+}
+
function getWatchNameById(id) {
return state.watchItems.find((item) => item.id === id)?.name || `Watch #${id}`;
}
@@ -270,6 +336,13 @@ function renderEvents() {
>
${event.is_ticket_purchased ? "Ticket entfernen" : "Ticket gekauft"}
+
@@ -328,17 +401,19 @@ function updateSyncStatus(message) {
}
async function loadData() {
- const [watchItems, events, notifications, providerStatuses] = await Promise.all([
+ const [watchItems, events, notifications, providerStatuses, watchSources] = await Promise.all([
apiFetch("/watch-items"),
apiFetch("/events"),
apiFetch("/notifications"),
apiFetch("/provider-statuses"),
+ apiFetch("/watch-sources"),
]);
state.watchItems = watchItems;
state.events = events;
state.notifications = notifications;
state.providerStatuses = providerStatuses;
+ state.watchSources = watchSources;
renderStats();
renderWatchItems();
@@ -433,7 +508,60 @@ document.addEventListener("click", async (event) => {
});
await loadData();
showToast("Ticketstatus aktualisiert.");
+ return;
}
+
+ if (action === "delete-event") {
+ await apiFetch(`/events/${id}`, { method: "DELETE" });
+ await loadData();
+ showToast("Event geloescht.");
+ return;
+ }
+
+ if (action === "delete-source") {
+ await apiFetch(`/watch-sources/${id}`, { method: "DELETE" });
+ await loadData();
+ showToast("Quelle geloescht.");
+ return;
+ }
+
+ if (action === "toggle-source") {
+ const source = state.watchSources.find((entry) => entry.id === Number(id));
+ await apiFetch(`/watch-sources/${id}`, {
+ method: "PATCH",
+ body: JSON.stringify({ is_active: !source.is_active }),
+ });
+ await loadData();
+ showToast("Quellenstatus aktualisiert.");
+ }
+ } catch (error) {
+ showToast(error.message);
+ }
+});
+
+document.addEventListener("submit", async (event) => {
+ const form = event.target.closest(".source-form");
+ if (!form) {
+ return;
+ }
+
+ event.preventDefault();
+ const watchId = form.dataset.watchId;
+ const formData = new FormData(form);
+ const payload = {
+ label: formData.get("label")?.toString().trim() || null,
+ url: formData.get("url")?.toString().trim(),
+ parser_type: "auto",
+ };
+
+ try {
+ await apiFetch(`/watch-items/${watchId}/sources`, {
+ method: "POST",
+ body: JSON.stringify(payload),
+ });
+ form.reset();
+ await loadData();
+ showToast("Quelle hinzugefuegt.");
} catch (error) {
showToast(error.message);
}
diff --git a/backend/app/frontend/static/styles.css b/backend/app/frontend/static/styles.css
index 86833a2..7420e53 100644
--- a/backend/app/frontend/static/styles.css
+++ b/backend/app/frontend/static/styles.css
@@ -381,6 +381,37 @@ button:hover,
line-height: 1.55;
}
+.source-list {
+ display: grid;
+ gap: 10px;
+ margin-top: 16px;
+}
+
+.source-row {
+ display: flex;
+ align-items: start;
+ justify-content: space-between;
+ gap: 16px;
+ padding: 14px;
+ border-radius: var(--radius-sm);
+ background: rgba(46, 39, 30, 0.05);
+}
+
+.source-row a {
+ display: block;
+ max-width: 46ch;
+ margin: 4px 0 8px;
+ overflow-wrap: anywhere;
+ color: var(--primary-dark);
+}
+
+.source-form {
+ display: grid;
+ grid-template-columns: 0.7fr 1.4fr auto;
+ gap: 10px;
+ margin-top: 14px;
+}
+
.action-button {
min-height: 38px;
padding: 0 14px;
@@ -467,6 +498,7 @@ button:hover,
}
.watch-form,
+ .source-form,
.status-panel {
grid-template-columns: 1fr;
}
diff --git a/backend/app/main.py b/backend/app/main.py
index ade1aab..6edba14 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -8,7 +8,7 @@ from sqlalchemy.orm import Session
from app.config import settings
from app.database import Base, engine, get_db
-from app.models import TrackedEvent, WatchItem
+from app.models import TrackedEvent, WatchItem, WatchSource
from app.scheduler import start_scheduler
from app.schemas import (
NotificationLogRead,
@@ -19,11 +19,15 @@ from app.schemas import (
WatchItemCreate,
WatchItemRead,
WatchItemUpdate,
+ WatchSourceCreate,
+ WatchSourceRead,
+ WatchSourceUpdate,
)
from app.services import (
list_events,
list_notifications,
list_provider_statuses,
+ list_watch_sources,
list_watch_items,
run_sync,
)
@@ -114,6 +118,66 @@ def delete_watch_item(watch_item_id: int, db: Session = Depends(get_db)):
db.commit()
+@app.get("/watch-sources", response_model=list[WatchSourceRead])
+def get_watch_sources(watch_item_id: int | None = None, db: Session = Depends(get_db)):
+ return list_watch_sources(db, watch_item_id)
+
+
+@app.post(
+ "/watch-items/{watch_item_id}/sources",
+ response_model=WatchSourceRead,
+ status_code=201,
+)
+def create_watch_source(
+ watch_item_id: int,
+ payload: WatchSourceCreate,
+ db: Session = Depends(get_db),
+):
+ watch_item = db.get(WatchItem, watch_item_id)
+ if watch_item is None:
+ raise HTTPException(status_code=404, detail="Watch item nicht gefunden.")
+
+ source = WatchSource(
+ watch_item=watch_item,
+ label=payload.label,
+ url=payload.url,
+ parser_type=payload.parser_type,
+ )
+ db.add(source)
+ db.commit()
+ db.refresh(source)
+ return source
+
+
+@app.patch("/watch-sources/{source_id}", response_model=WatchSourceRead)
+def update_watch_source(
+    source_id: int,
+    payload: WatchSourceUpdate,
+    db: Session = Depends(get_db),
+):
+    source = db.get(WatchSource, source_id)
+    if source is None:
+        raise HTTPException(status_code=404, detail="Quelle nicht gefunden.")
+    for field_name, value in payload.model_dump(exclude_unset=True).items():
+        # Only "label" is nullable; an explicit null elsewhere would violate NOT NULL on commit.
+        if value is not None or field_name == "label":
+            setattr(source, field_name, value)
+    source.updated_at = datetime.utcnow()
+    db.commit()
+    db.refresh(source)
+    return source
+
+
+@app.delete("/watch-sources/{source_id}", status_code=204)
+def delete_watch_source(source_id: int, db: Session = Depends(get_db)):
+ source = db.get(WatchSource, source_id)
+ if source is None:
+ raise HTTPException(status_code=404, detail="Quelle nicht gefunden.")
+
+ db.delete(source)
+ db.commit()
+
+
@app.get("/events", response_model=list[TrackedEventRead])
def get_events(db: Session = Depends(get_db)):
return list_events(db)
@@ -137,6 +201,16 @@ def update_purchase_status(
return tracked_event
+@app.delete("/events/{event_id}", status_code=204)
+def delete_event(event_id: int, db: Session = Depends(get_db)):
+ tracked_event = db.get(TrackedEvent, event_id)
+ if tracked_event is None:
+ raise HTTPException(status_code=404, detail="Event nicht gefunden.")
+
+ db.delete(tracked_event)
+ db.commit()
+
+
@app.get("/notifications", response_model=list[NotificationLogRead])
def get_notifications(db: Session = Depends(get_db)):
return list_notifications(db)
diff --git a/backend/app/models.py b/backend/app/models.py
index 03a7bcc..6cb84c8 100644
--- a/backend/app/models.py
+++ b/backend/app/models.py
@@ -35,6 +35,13 @@ class ProviderStatusType(str, Enum):
error = "error"
+class SourceStatusType(str, Enum):
+ pending = "pending"
+ ok = "ok"
+ no_match = "no_match"
+ error = "error"
+
+
class WatchItem(Base):
__tablename__ = "watch_items"
@@ -57,6 +64,36 @@ class WatchItem(Base):
tracked_events: Mapped[list["TrackedEvent"]] = relationship(
back_populates="watch_item", cascade="all, delete-orphan"
)
+ sources: Mapped[list["WatchSource"]] = relationship(
+ back_populates="watch_item", cascade="all, delete-orphan"
+ )
+
+
+class WatchSource(Base):
+ __tablename__ = "watch_sources"
+
+ id: Mapped[int] = mapped_column(Integer, primary_key=True, index=True)
+ watch_item_id: Mapped[int] = mapped_column(ForeignKey("watch_items.id"), nullable=False)
+ label: Mapped[str | None] = mapped_column(String(255), nullable=True)
+ url: Mapped[str] = mapped_column(String(1024), nullable=False)
+ parser_type: Mapped[str] = mapped_column(String(50), default="auto", nullable=False)
+ is_active: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)
+ last_status: Mapped[SourceStatusType] = mapped_column(
+ SqlEnum(SourceStatusType), default=SourceStatusType.pending, nullable=False
+ )
+ last_message: Mapped[str | None] = mapped_column(Text, nullable=True)
+ last_checked_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
+ created_at: Mapped[datetime] = mapped_column(
+ DateTime, default=datetime.utcnow, nullable=False
+ )
+ updated_at: Mapped[datetime] = mapped_column(
+ DateTime,
+ default=datetime.utcnow,
+ onupdate=datetime.utcnow,
+ nullable=False,
+ )
+
+ watch_item: Mapped[WatchItem] = relationship(back_populates="sources")
class TrackedEvent(Base):
diff --git a/backend/app/providers/barclays_arena.py b/backend/app/providers/barclays_arena.py
index bcdf841..3292477 100644
--- a/backend/app/providers/barclays_arena.py
+++ b/backend/app/providers/barclays_arena.py
@@ -1,4 +1,5 @@
from datetime import datetime
+import re
from urllib.parse import urljoin
import requests
@@ -10,7 +11,7 @@ from app.providers.utils import normalize_search_text
class BarclaysArenaProvider:
source_name = "barclays_arena"
- events_url = "https://www.barclays-arena.de/events"
+ events_url = "https://www.barclays-arena.de/events/search"
def search_events(
self,
@@ -29,73 +30,104 @@ class BarclaysArenaProvider:
normalized_term = normalize_search_text(term)
results: list[dict] = []
- headings = soup.find_all("h3")
- for heading in headings:
+ for heading in soup.find_all("h3"):
title = heading.get_text(" ", strip=True)
if not title:
continue
- subtitle = ""
- subtitle_el = heading.find_next("h4")
- if subtitle_el:
- subtitle = subtitle_el.get_text(" ", strip=True)
+ subtitle_el = heading.find_next_sibling("h4")
+ subtitle = subtitle_el.get_text(" ", strip=True) if subtitle_el else ""
+ # Keep matching local to the actual heading/subtitle pair. Wider
+ # parent containers often contain several event cards.
haystack = normalize_search_text(f"{title} {subtitle}")
if normalized_term not in haystack:
continue
- date_text = self._find_previous_date_text(heading)
- event_date = self._parse_german_date(date_text)
-
- link = heading.find_previous("a", href=True)
- if link is None:
+ detail_link = self._find_card_link(heading)
+ if detail_link is None:
continue
+ date_text = self._find_card_date_text(heading)
+ event_date = self._parse_german_date(date_text)
+ href = detail_link["href"]
+
results.append(
{
- "external_id": link["href"],
+ "external_id": href,
"title": title,
"matched_term": term,
"venue_name": "Barclays Arena",
"city": "Hamburg",
"country_code": "DE",
"event_date": event_date,
- "ticket_url": urljoin(self.events_url, link["href"]),
+ "ticket_url": urljoin(self.events_url, href),
"image_url": None,
"raw_payload": {
"title": title,
"subtitle": subtitle,
"date_text": date_text,
- "href": link["href"],
+ "href": href,
},
}
)
+ unique_results: dict[str, dict] = {}
+ for result in results:
+ unique_results[result["external_id"]] = result
+
self.last_status = "ok"
self.last_message = (
- f"Barclays Arena returned {len(results)} matched events for term '{term}'."
+ f"Barclays Arena returned {len(unique_results)} matched events for term '{term}'."
)
- return results
+ return list(unique_results.values())
- def _find_previous_date_text(self, heading) -> str | None:
- current = heading.previous_sibling
- while current is not None:
+ def _find_card_link(self, heading):
+ link = heading.find_parent("a", href=re.compile(r"/events/"))
+ if link is not None:
+ return link
+
+ current = heading
+ for _ in range(5):
+ current = current.parent
+ if current is None:
+ return None
+ link = current.find("a", href=re.compile(r"/events/"))
+ if link is not None and heading in link.find_all("h3"):
+ return link
+ return None
+
+ def _find_card_date_text(self, heading) -> str | None:
+ current = heading
+ for _ in range(6):
+ current = current.previous_element
+ if current is None:
+ return None
text = getattr(current, "get_text", lambda *args, **kwargs: str(current))(
" ", strip=True
)
- if text and "|" in text:
- return text
- current = current.previous_sibling
+ date_text = self._extract_date_text(text)
+ if date_text:
+ return date_text
+ return None
+
+ def _extract_date_text(self, text: str) -> str | None:
+ match = re.search(
+ r"(Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag)\s*\|\s*(\d{2}\.\d{2}\.\d{4})",
+ text,
+ )
+ if match:
+ return match.group(2)
+
+ match = re.search(r"\b(\d{2}\.\d{2}\.\d{4})\b", text)
+ if match:
+ return match.group(1)
return None
def _parse_german_date(self, value: str | None) -> datetime | None:
if not value:
return None
- parts = [part.strip() for part in value.split("|")]
- if len(parts) < 2:
- return None
try:
- return datetime.strptime(parts[1], "%d.%m.%Y")
+ return datetime.strptime(value, "%d.%m.%Y")
except ValueError:
return None
-
diff --git a/backend/app/schemas.py b/backend/app/schemas.py
index 611efb4..a96c167 100644
--- a/backend/app/schemas.py
+++ b/backend/app/schemas.py
@@ -7,6 +7,7 @@ from app.models import (
NotificationType,
ProviderStatusType,
RegionScope,
+ SourceStatusType,
WatchType,
)
@@ -39,6 +40,35 @@ class WatchItemRead(BaseModel):
updated_at: datetime
+class WatchSourceCreate(BaseModel):  # url must be http(s): the scanner fetches it with requests
+    label: str | None = Field(default=None, max_length=255)
+    url: str = Field(min_length=8, max_length=1024, pattern=r"(?i)^https?://")
+    parser_type: str = "auto"
+
+
+class WatchSourceUpdate(BaseModel):  # all fields optional; url, when given, must be http(s)
+    label: str | None = Field(default=None, max_length=255)
+    url: str | None = Field(default=None, min_length=8, max_length=1024, pattern=r"(?i)^https?://")
+    parser_type: str | None = None
+    is_active: bool | None = None
+
+
+class WatchSourceRead(BaseModel):
+ model_config = ConfigDict(from_attributes=True)
+
+ id: int
+ watch_item_id: int
+ label: str | None
+ url: str
+ parser_type: str
+ is_active: bool
+ last_status: SourceStatusType
+ last_message: str | None
+ last_checked_at: datetime | None
+ created_at: datetime
+ updated_at: datetime
+
+
class PurchaseUpdate(BaseModel):
is_ticket_purchased: bool
diff --git a/backend/app/services.py b/backend/app/services.py
index 85cc90b..a09ddca 100644
--- a/backend/app/services.py
+++ b/backend/app/services.py
@@ -10,12 +10,15 @@ from app.models import (
NotificationType,
ProviderStatus,
ProviderStatusType,
+ SourceStatusType,
TrackedEvent,
WatchItem,
+ WatchSource,
)
from app.notifications import send_email_notification
from app.providers.registry import get_providers
from app.schemas import SyncResult
+from app.source_scanner import SourceScanner
logger = logging.getLogger(__name__)
@@ -71,6 +74,13 @@ def list_provider_statuses(db: Session) -> list[ProviderStatus]:
return list(db.scalars(select(ProviderStatus).order_by(ProviderStatus.provider_name)))
+def list_watch_sources(db: Session, watch_item_id: int | None = None) -> list[WatchSource]:
+ stmt = select(WatchSource).order_by(WatchSource.created_at)
+ if watch_item_id is not None:
+ stmt = stmt.where(WatchSource.watch_item_id == watch_item_id)
+ return list(db.scalars(stmt))
+
+
def update_provider_status(
db: Session,
provider_name: str,
@@ -261,6 +271,7 @@ def upsert_event(
def run_sync(db: Session) -> SyncResult:
providers = get_providers()
+ source_scanner = SourceScanner()
provider_states = {
provider.source_name: init_provider_sync_state(provider.source_name)
for provider in providers
@@ -275,6 +286,75 @@ def run_sync(db: Session) -> SyncResult:
notifications_skipped = 0
for watch_item in active_items:
+ active_sources = [source for source in watch_item.sources if source.is_active]
+ for source in active_sources:
+ try:
+ events = source_scanner.scan(watch_item, source)
+ source.last_status = (
+ SourceStatusType.ok if events else SourceStatusType.no_match
+ )
+ source.last_message = (
+ f"{len(events)} passende Events gefunden."
+ if events
+ else "Keine passenden Events auf dieser Quelle gefunden."
+ )
+ source.last_checked_at = datetime.utcnow()
+ except Exception as exc:
+ logger.exception(
+ "Source scan failed for watch_item=%s source=%s",
+ watch_item.name,
+ source.url,
+ )
+ db.rollback()
+ source.last_status = SourceStatusType.error
+ source.last_message = f"Scan fehlgeschlagen: {exc}"
+ source.last_checked_at = datetime.utcnow()
+ db.add(source)
+ db.commit()
+ continue
+
+ for event_data in events:
+ tracked_event, is_new = upsert_event(
+ db=db,
+ watch_item=watch_item,
+ provider_name=f"source:{source.id}",
+ event_data=event_data,
+ )
+ if is_new:
+ new_events += 1
+ else:
+ updated_events += 1
+
+ should_notify = (
+ is_new
+ and tracked_event.discovery_notified_at is None
+ and not has_equivalent_existing_event(db, tracked_event)
+ )
+ if should_notify:
+ status = send_email_notification(
+ db=db,
+ tracked_event=tracked_event,
+ notification_type=NotificationType.discovery,
+ subject=f"Neuer Termin fuer {watch_item.name}",
+ body=(
+ f"Es wurde ein neuer Termin fuer '{watch_item.name}' gefunden.\n\n"
+ f"Quelle: {source.label or source.url}\n"
+ f"Titel: {tracked_event.title}\n"
+ f"Ort: {tracked_event.venue_name or 'unbekannt'}\n"
+ f"Stadt: {tracked_event.city or 'unbekannt'}\n"
+ f"Datum: {tracked_event.event_date or 'unbekannt'}\n"
+ f"Tickets: {tracked_event.ticket_url or 'keine URL'}\n"
+ ),
+ )
+ if status == NotificationStatus.sent:
+ tracked_event.discovery_notified_at = datetime.utcnow()
+ notifications_sent += 1
+ else:
+ notifications_skipped += 1
+
+ db.add(source)
+ db.commit()
+
for provider in providers:
try:
events = provider.search_events(
diff --git a/backend/app/source_scanner.py b/backend/app/source_scanner.py
new file mode 100644
index 0000000..4fec107
--- /dev/null
+++ b/backend/app/source_scanner.py
@@ -0,0 +1,446 @@
+import json
+import re
+from datetime import datetime
+from html import unescape
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup
+
+from app.models import RegionScope, WatchItem, WatchSource, WatchType
+from app.providers.utils import normalize_search_text
+
+
+MONTH_ALIASES = {
+    "jan": 1,
+    "januar": 1,
+    "feb": 2,
+    "februar": 2,
+    "maer": 3,
+    "maerz": 3,
+    "mar": 3,
+    "marz": 3,  # was a duplicate "maerz"; covers "märz" if normalization drops the umlaut
+    "apr": 4,
+    "april": 4,
+    "mai": 5,
+    "jun": 6,
+    "juni": 6,
+    "jul": 7,
+    "juli": 7,
+    "aug": 8,
+    "august": 8,
+    "sep": 9,
+    "sept": 9,
+    "september": 9,
+    "okt": 10,
+    "oktober": 10,
+    "nov": 11,
+    "november": 11,
+    "dez": 12,
+    "dezember": 12,
+}
+
+
+class SourceScanner:
+ headers = {
+ "User-Agent": "eventlens/0.1 (+https://local)",
+ "Accept": "text/html,application/xhtml+xml,application/json",
+ "Accept-Language": "de-DE,de;q=0.9,en;q=0.7",
+ }
+
+    def scan(self, watch_item: WatchItem, source: WatchSource) -> list[dict]:
+        """Fetch ``source.url`` and return event dicts matching ``watch_item``.
+
+        Raises ``requests`` exceptions on network or HTTP errors.
+        """
+        response = requests.get(source.url, headers=self.headers, timeout=30)
+        response.raise_for_status()
+
+        # Media types are case-insensitive and JSON-LD is served as application/ld+json.
+        media_type = response.headers.get("content-type", "").split(";")[0].strip().lower()
+        if media_type == "application/json" or media_type.endswith("+json"):
+            return self._scan_json(watch_item, source, response.json())
+        return self._scan_html(watch_item, source, response.text)
+
+ def _scan_json(self, watch_item: WatchItem, source: WatchSource, payload) -> list[dict]:
+ events = self._extract_jsonld_events(payload)
+ return self._events_from_jsonld(watch_item, source, events)
+
+ def _scan_html(self, watch_item: WatchItem, source: WatchSource, html: str) -> list[dict]:
+ soup = BeautifulSoup(html, "html.parser")
+ jsonld_events = []
+
+ for script in soup.find_all("script", type="application/ld+json"):
+ raw_payload = script.string or script.get_text()
+ if not raw_payload:
+ continue
+ try:
+ payload = json.loads(unescape(raw_payload))
+ except json.JSONDecodeError:
+ continue
+ jsonld_events.extend(self._extract_jsonld_events(payload))
+
+ jsonld_results = self._events_from_jsonld(watch_item, source, jsonld_events)
+ if jsonld_results:
+ return jsonld_results
+
+ return self._events_from_html_text(watch_item, source, soup)
+
+    def _extract_jsonld_events(self, payload) -> list[dict]:
+        """Recursively collect schema.org event objects from a JSON-LD payload.
+
+        Lists and ``@graph`` members are walked; ``Event`` and every type
+        whose name ends in ``Event`` (e.g. ``MusicEvent``) is accepted.
+        """
+        if isinstance(payload, list):
+            return [event for item in payload for event in self._extract_jsonld_events(item)]
+        if not isinstance(payload, dict):
+            return []
+
+        events: list[dict] = []
+        graph = payload.get("@graph")
+        if isinstance(graph, list):
+            events.extend(self._extract_jsonld_events(graph))
+
+        item_type = payload.get("@type")
+        type_names = item_type if isinstance(item_type, list) else [item_type]
+        # Venue pages usually publish subtypes such as MusicEvent or
+        # TheaterEvent rather than the bare Event type.
+        if any(isinstance(name, str) and name.endswith("Event") for name in type_names):
+            events.append(payload)
+
+        return events
+
+    def _events_from_jsonld(
+        self,
+        watch_item: WatchItem,
+        source: WatchSource,
+        events: list[dict],
+    ) -> list[dict]:
+        """Map JSON-LD events matching the watch item (name, region, not past) to event dicts."""
+        results: list[dict] = []
+        normalized_term = normalize_search_text(watch_item.name)
+        for event in events:
+            title = event.get("name") or ""
+            performers = self._extract_performer_names(event)
+            haystack = normalize_search_text(" ".join([title] + performers))
+            if normalized_term not in haystack:
+                continue
+            # "location"/"address" may be plain text or lists; only dicts carry the fields read below.
+            location = event.get("location")
+            location = location if isinstance(location, dict) else {}
+            address = location.get("address")
+            address = address if isinstance(address, dict) else {}
+            city = address.get("addressLocality") or location.get("name")
+            if watch_item.region_scope == RegionScope.hamburg and normalize_search_text(city or "") != "hamburg":
+                continue
+            event_date = self._parse_datetime(event.get("startDate"))
+            if event_date and event_date.date() < datetime.utcnow().date():
+                continue
+            ticket_url = event.get("url") or source.url
+            results.append(
+                {
+                    "external_id": str(event.get("@id") or ticket_url or f"{source.id}:{title}"),
+                    "title": title or watch_item.name,
+                    "matched_term": watch_item.name,
+                    "venue_name": location.get("name") or source.label,
+                    "city": city,
+                    "country_code": "DE",
+                    "event_date": event_date,
+                    "ticket_url": ticket_url,
+                    "image_url": self._extract_image(event),
+                    "raw_payload": event,
+                }
+            )
+
+        return results
+
+ def _events_from_html_text(
+ self,
+ watch_item: WatchItem,
+ source: WatchSource,
+ soup: BeautifulSoup,
+ ) -> list[dict]:
+ text = soup.get_text(" ", strip=True)
+ normalized_text = normalize_search_text(text)
+ normalized_term = normalize_search_text(watch_item.name)
+ if normalized_term not in normalized_text:
+ return []
+
+ results: list[dict] = []
+ seen_keys: set[str] = set()
+ for context in self._find_matching_contexts(soup, watch_item):
+ context_text = context.get_text(" ", strip=True)
+ event_date = self._find_nearest_date(context_text, watch_item.name)
+ if event_date is None:
+ continue
+ if event_date.date() < datetime.utcnow().date():
+ continue
+ if (
+ watch_item.region_scope == RegionScope.hamburg
+ and "hamburg" not in normalize_search_text(context_text)
+ ):
+ continue
+
+ title = self._find_title(context, watch_item.name)
+ link = self._find_nearest_link(context, watch_item.name, source.url) or source.url
+ key = f"{source.id}:{normalize_search_text(title)}:{event_date.date().isoformat()}"
+ if key in seen_keys:
+ continue
+ seen_keys.add(key)
+
+ results.append(
+ {
+ "external_id": key,
+ "title": title,
+ "matched_term": watch_item.name,
+ "venue_name": self._find_venue(context_text, source.label),
+ "city": "Hamburg" if watch_item.region_scope == RegionScope.hamburg else None,
+ "country_code": "DE",
+ "event_date": event_date,
+ "ticket_url": link,
+ "image_url": None,
+ "raw_payload": {
+ "source_url": source.url,
+ "parser": "html_text",
+ "context": context_text[:1000],
+ },
+ }
+ )
+
+ return results
+
+    def _extract_performer_names(self, event: dict) -> list[str]:
+        """Return performer names as strings; tolerates dict, list or plain-string forms."""
+        performer = event.get("performer") or event.get("performers")
+        entries = performer if isinstance(performer, list) else [performer]
+        names = [e.get("name") if isinstance(e, dict) else e for e in entries]
+        # Keep only real strings so the caller can safely " ".join() them.
+        return [name for name in names if isinstance(name, str) and name]
+
+ def _extract_image(self, event: dict) -> str | None:
+ image = event.get("image")
+ if isinstance(image, str):
+ return image
+ if isinstance(image, list):
+ for item in image:
+ if isinstance(item, str):
+ return item
+ return None
+
+ def _parse_datetime(self, value: str | None) -> datetime | None:
+ if not value:
+ return None
+ try:
+ return datetime.fromisoformat(value.replace("Z", "+00:00")).replace(tzinfo=None)
+ except ValueError:
+ pass
+ for fmt in ("%d.%m.%Y", "%Y-%m-%d"):
+ try:
+ return datetime.strptime(value[:10], fmt)
+ except ValueError:
+ continue
+ return None
+
+    def _find_nearest_date(self, text: str, term: str) -> datetime | None:
+        """Return the earliest upcoming date found near ``term`` in ``text``.
+
+        Numeric German dates (``12.05.2026``, ``12.05.26``, ``12.05.``) and
+        month-name dates (``12. Mai 2026``, ``Mai 12``) within a window around
+        the first occurrence of ``term`` are considered. If none of them is
+        today or later, the earliest past date is returned; ``None`` means no
+        date was found at all.
+        """
+        term_index = normalize_search_text(text).find(normalize_search_text(term))
+        search_area = text
+        if term_index >= 0:
+            # NOTE(review): the index comes from the normalized text but slices the
+            # raw text; the wide window tolerates small offsets - confirm.
+            search_area = text[max(0, term_index - 300) : term_index + 500]
+
+        candidates: list[datetime] = []
+        for pattern in (
+            r"\b(\d{1,2}\.\d{1,2}\.\d{4})\b",
+            r"\b(\d{1,2}\.\d{1,2}\.\d{2})\b",
+            # Year-less dates must not be followed by a digit; otherwise the
+            # "12.05." prefix of "12.05.2030" adds a bogus current-year date.
+            r"\b(\d{1,2}\.\d{1,2}\.)(?!\d)",
+        ):
+            for match in re.finditer(pattern, search_area):
+                parsed = self._parse_german_date(match.group(1))
+                if parsed:
+                    candidates.append(parsed)
+
+        month_name_pattern = (
+            r"jan(?:uar)?|feb(?:ruar)?|m(?:ae|ä)r(?:z)?|apr(?:il)?|mai|jun(?:i)?|"
+            r"jul(?:i)?|aug(?:ust)?|sep(?:t|tember)?|okt(?:ober)?|nov(?:ember)?|dez(?:ember)?"
+        )
+        day_first = rf"\b(\d{{1,2}})\.?\s+({month_name_pattern})\.?\s*(\d{{4}})?\b"
+        for match in re.finditer(day_first, search_area, re.IGNORECASE):
+            parsed = self._parse_named_month_date(match.group(1), match.group(2), match.group(3))
+            if parsed:
+                candidates.append(parsed)
+        month_first = rf"\b({month_name_pattern})\.?\s+(\d{{1,2}})\.?\s*(\d{{4}})?\b"
+        for match in re.finditer(month_first, search_area, re.IGNORECASE):
+            parsed = self._parse_named_month_date(match.group(2), match.group(1), match.group(3))
+            if parsed:
+                candidates.append(parsed)
+
+        # Prefer upcoming dates; fall back to the earliest past one.
+        today = datetime.utcnow().date()
+        upcoming = [candidate for candidate in candidates if candidate.date() >= today]
+        return min(upcoming or candidates, default=None)
+
+ def _parse_german_date(self, value: str) -> datetime | None:
+ cleaned = value.strip()
+ current_year = datetime.utcnow().year
+ candidates = [cleaned]
+ if re.fullmatch(r"\d{1,2}\.\d{1,2}\.", cleaned):
+ candidates.append(f"{cleaned}{current_year}")
+ candidates.append(f"{cleaned}{current_year + 1}")
+ elif re.fullmatch(r"\d{1,2}\.\d{1,2}\.\d{2}", cleaned):
+ day, month, year = cleaned.split(".")
+ candidates.append(f"{day}.{month}.20{year}")
+
+ for candidate in candidates:
+ try:
+ parsed = datetime.strptime(candidate, "%d.%m.%Y")
+ if parsed.date() < datetime.utcnow().date() and candidate != cleaned:
+ continue
+ return parsed
+ except ValueError:
+ continue
+ return None
+
+ def _parse_named_month_date(
+ self,
+ day_value: str,
+ month_value: str,
+ year_value: str | None,
+ ) -> datetime | None:
+ month = MONTH_ALIASES.get(normalize_search_text(month_value).rstrip("."))
+ if month is None:
+ return None
+
+ day = int(day_value)
+ current_year = datetime.utcnow().year
+ years = [int(year_value)] if year_value else [current_year, current_year + 1]
+ for year in years:
+ try:
+ parsed = datetime(year, month, day)
+ except ValueError:
+ continue
+ if year_value or parsed.date() >= datetime.utcnow().date():
+ return parsed
+ return None
+
+ def _find_matching_contexts(self, soup: BeautifulSoup, watch_item: WatchItem) -> list:
+ normalized_term = normalize_search_text(watch_item.name)
+ selectors = [
+ "li.card",
+ ".tourplan .row",
+ "[class*=event]",
+ "[class*=termin]",
+ "article",
+ "tr",
+ "li",
+ ".row",
+ ]
+ candidates = []
+ seen_nodes = set()
+
+ for selector in selectors:
+ for node in soup.select(selector):
+ if id(node) in seen_nodes:
+ continue
+ seen_nodes.add(id(node))
+ text = node.get_text(" ", strip=True)
+ if normalized_term not in normalize_search_text(text):
+ continue
+ if len(text) > 3500:
+ continue
+ if self._find_nearest_date(text, watch_item.name):
+ candidates.append(node)
+
+ if candidates:
+ return candidates
+
+ fallback = self._find_best_context(soup, watch_item.name)
+ return [fallback] if fallback is not None else []
+
+    def _find_venue(self, text: str, default: str | None) -> str | None:  # default is source.label (nullable)
+        lines = [line.strip() for line in re.split(r"\s{2,}|\n|\r", text) if line.strip()]
+        for line in lines:
+            normalized = normalize_search_text(line)
+            if "hamburg" in normalized and len(line) <= 120:  # first short chunk mentioning Hamburg wins
+                return line
+        return default
+
+ def _find_best_context(self, soup: BeautifulSoup, term: str):
+ normalized_term = normalize_search_text(term)
+ candidates = []
+ for node in soup.find_all(string=True):
+ if normalized_term in normalize_search_text(str(node)):
+ parent = node.parent
+ if parent is None:
+ continue
+ best_parent = self._climb_to_context_with_date(parent, term)
+ text = best_parent.get_text(" ", strip=True)
+ candidates.append(
+ (
+ 0 if self._find_nearest_date(text, term) else 1,
+ len(text),
+ best_parent,
+ )
+ )
+
+ if not candidates:
+ return None
+
+ candidates.sort(key=lambda item: (item[0], item[1]))
+ return candidates[0][2]
+
+ def _climb_to_context_with_date(self, node, term: str):
+ current = node
+ best = node
+ for _ in range(6):
+ if current is None:
+ break
+ context_text = current.get_text(" ", strip=True)
+ if self._find_nearest_date(context_text, term):
+ return current
+ best = current
+ current = current.parent
+ return best
+
+ def _find_title(self, soup: BeautifulSoup, term: str) -> str:
+ if soup is None:
+ return term
+ normalized_term = normalize_search_text(term)
+ for heading in soup.find_all(["h1", "h2", "h3", "h4", "strong", "b", "a"]):
+ title = heading.get_text(" ", strip=True)
+ if normalized_term in normalize_search_text(title):
+ return title
+
+ text = soup.get_text(" ", strip=True)
+ dated_match = re.search(
+ r"(.{0,40}\d{1,2}\.\d{1,2}\.(?:\d{2,4})?.{0,100}"
+ + re.escape(term)
+ + r".{0,100})",
+ text,
+ re.IGNORECASE,
+ )
+ if dated_match:
+ return " ".join(dated_match.group(1).split())
+
+ match = re.search(r"(.{0,80}" + re.escape(term) + r".{0,80})", text, re.IGNORECASE)
+ if match:
+ return " ".join(match.group(1).split())
+ return term
+
+ def _find_nearest_link(self, soup: BeautifulSoup, term: str, base_url: str) -> str | None:
+ normalized_term = normalize_search_text(term)
+ for link in soup.find_all("a", href=True):
+ if normalized_term in normalize_search_text(link.get_text(" ", strip=True)):
+ return urljoin(base_url, link["href"])
+ return None