auf Webseiten Modell umgestellt

This commit is contained in:
ecki
2026-04-18 14:23:24 +02:00
parent 5510d58e5a
commit 6cfbdba0a4
9 changed files with 890 additions and 30 deletions
+1
View File
@@ -114,3 +114,4 @@ sudo docker compose up -d --build
- Bandsintown benoetigt eine echte, von Bandsintown freigeschaltete App-ID. Ohne diese wird der Provider deaktiviert oder als `blocked` angezeigt.
- Barclays Arena wird ueber die offizielle Eventseite der Arena abgefragt.
- Fabrik wird ueber die offizielle Veranstaltungsseite der Fabrik Hamburg abgefragt.
- Fuer robuste persoenliche Ueberwachung koennen pro Watchlist-Eintrag direkte Quellen-URLs hinterlegt werden. Diese werden beim Sync gezielt per JSON-LD und HTML-Textscan durchsucht.
+129 -1
View File
@@ -3,6 +3,7 @@ const state = {
events: [],
notifications: [],
providerStatuses: [],
watchSources: [],
};
const watchItemsEl = document.querySelector("#watch-items");
@@ -92,6 +93,12 @@ function renderStats() {
}
function prettifyProviderName(value) {
if (value?.startsWith("source:")) {
const sourceId = Number(value.split(":")[1]);
const source = state.watchSources.find((entry) => entry.id === sourceId);
return source?.label || "Direkte Quelle";
}
const names = {
ticketmaster: "Ticketmaster",
bandsintown: "Bandsintown",
@@ -206,12 +213,71 @@ function renderWatchItems() {
</div>
</div>
<p>${escapeHtml(item.notes || "Keine Notiz hinterlegt.")}</p>
${renderSourceList(item.id)}
<form class="source-form" data-watch-id="${item.id}">
<input
name="label"
type="text"
placeholder="Quelle, z. B. Kuenstlerseite"
/>
<input
name="url"
type="url"
placeholder="https://..."
required
/>
<button type="submit" class="action-button success">Quelle hinzufuegen</button>
</form>
</article>
`
)
.join("");
}
/**
 * Build the HTML for the direct sources attached to one watch item.
 *
 * Reads `state.watchSources`, keeps the entries whose `watch_item_id` is
 * strictly equal to `watchItemId`, and renders one row per source with its
 * label, URL, last scan status/message and the toggle/delete buttons.
 *
 * @param watchItemId Id of the watch item whose sources should be listed.
 * @returns {string} HTML markup; a muted placeholder when there are no sources.
 */
function renderSourceList(watchItemId) {
  const sources = state.watchSources.filter((source) => source.watch_item_id === watchItemId);
  if (!sources.length) {
    return '<div class="source-list muted">Noch keine direkten Quellen hinterlegt.</div>';
  }

  // Everything that is neither "ok" nor "error" is shown as a warning pill.
  const statusClasses = { ok: "success", error: "danger" };

  const rows = sources.map((source) => {
    const escapedUrl = escapeHtml(source.url);
    // escapeHtml() only prevents markup injection. A stored "javascript:" URL
    // would still run on click, so only http(s) URLs are rendered as links;
    // anything else is shown as plain text.
    const isWebUrl = /^https?:\/\//i.test(String(source.url ?? ""));
    const urlMarkup = isWebUrl
      ? `<a href="${escapedUrl}" target="_blank" rel="noreferrer">${escapedUrl}</a>`
      : `<span class="muted">${escapedUrl}</span>`;
    const statusClass = statusClasses[source.last_status] || "warning";

    return `
      <div class="source-row">
        <div>
          <strong>${escapeHtml(source.label || "Quelle")}</strong>
          ${urlMarkup}
          <div class="pill-row">
            <span class="pill ${statusClass}">${escapeHtml(source.last_status)}</span>
            <span class="muted">${escapeHtml(source.last_message || "Noch nicht gescannt.")}</span>
          </div>
        </div>
        <div class="action-row">
          <button class="action-button" data-action="toggle-source" data-id="${source.id}">
            ${source.is_active ? "Pausieren" : "Aktivieren"}
          </button>
          <button class="action-button danger" data-action="delete-source" data-id="${source.id}">
            Loeschen
          </button>
        </div>
      </div>
    `;
  });

  return `
    <div class="source-list">
      ${rows.join("")}
    </div>
  `;
}
/**
 * Look up the display name of a watch item in `state.watchItems`.
 *
 * @param id Id to compare (strictly) against each item's `id`.
 * @returns {string} The item's name, or "Watch #<id>" when no item matches
 *   or the matching item has an empty name.
 */
function getWatchNameById(id) {
  const match = state.watchItems.find((item) => item.id === id);
  const name = match?.name;
  if (name) {
    return name;
  }
  return `Watch #${id}`;
}
@@ -270,6 +336,13 @@ function renderEvents() {
>
${event.is_ticket_purchased ? "Ticket entfernen" : "Ticket gekauft"}
</button>
<button
class="action-button danger"
data-action="delete-event"
data-id="${event.id}"
>
Loeschen
</button>
</div>
</div>
<div class="event-meta">
@@ -328,17 +401,19 @@ function updateSyncStatus(message) {
}
async function loadData() {
const [watchItems, events, notifications, providerStatuses] = await Promise.all([
const [watchItems, events, notifications, providerStatuses, watchSources] = await Promise.all([
apiFetch("/watch-items"),
apiFetch("/events"),
apiFetch("/notifications"),
apiFetch("/provider-statuses"),
apiFetch("/watch-sources"),
]);
state.watchItems = watchItems;
state.events = events;
state.notifications = notifications;
state.providerStatuses = providerStatuses;
state.watchSources = watchSources;
renderStats();
renderWatchItems();
@@ -433,7 +508,60 @@ document.addEventListener("click", async (event) => {
});
await loadData();
showToast("Ticketstatus aktualisiert.");
return;
}
if (action === "delete-event") {
await apiFetch(`/events/${id}`, { method: "DELETE" });
await loadData();
showToast("Event geloescht.");
return;
}
if (action === "delete-source") {
await apiFetch(`/watch-sources/${id}`, { method: "DELETE" });
await loadData();
showToast("Quelle geloescht.");
return;
}
if (action === "toggle-source") {
const source = state.watchSources.find((entry) => entry.id === Number(id));
await apiFetch(`/watch-sources/${id}`, {
method: "PATCH",
body: JSON.stringify({ is_active: !source.is_active }),
});
await loadData();
showToast("Quellenstatus aktualisiert.");
}
} catch (error) {
showToast(error.message);
}
});
document.addEventListener("submit", async (event) => {
const form = event.target.closest(".source-form");
if (!form) {
return;
}
event.preventDefault();
const watchId = form.dataset.watchId;
const formData = new FormData(form);
const payload = {
label: formData.get("label")?.toString().trim() || null,
url: formData.get("url")?.toString().trim(),
parser_type: "auto",
};
try {
await apiFetch(`/watch-items/${watchId}/sources`, {
method: "POST",
body: JSON.stringify(payload),
});
form.reset();
await loadData();
showToast("Quelle hinzugefuegt.");
} catch (error) {
showToast(error.message);
}
+32
View File
@@ -381,6 +381,37 @@ button:hover,
line-height: 1.55;
}
/* Vertical stack of source rows shown below a watch item. */
.source-list {
  display: grid;
  gap: 10px;
  margin-top: 16px;
}
/* One source: details on the left, action buttons on the right. */
.source-row {
  display: flex;
  align-items: start;
  justify-content: space-between;
  gap: 16px;
  padding: 14px;
  border-radius: var(--radius-sm);
  background: rgba(46, 39, 30, 0.05);
}
/* Source URL: own line, width-capped, long URLs may break anywhere. */
.source-row a {
  display: block;
  max-width: 46ch;
  margin: 4px 0 8px;
  overflow-wrap: anywhere;
  color: var(--primary-dark);
}
/* "Add source" form: label input, wider URL input, auto-sized submit button. */
.source-form {
  display: grid;
  grid-template-columns: 0.7fr 1.4fr auto;
  gap: 10px;
  margin-top: 14px;
}
.action-button {
min-height: 38px;
padding: 0 14px;
@@ -467,6 +498,7 @@ button:hover,
}
.watch-form,
.source-form,
.status-panel {
grid-template-columns: 1fr;
}
+75 -1
View File
@@ -8,7 +8,7 @@ from sqlalchemy.orm import Session
from app.config import settings
from app.database import Base, engine, get_db
from app.models import TrackedEvent, WatchItem
from app.models import TrackedEvent, WatchItem, WatchSource
from app.scheduler import start_scheduler
from app.schemas import (
NotificationLogRead,
@@ -19,11 +19,15 @@ from app.schemas import (
WatchItemCreate,
WatchItemRead,
WatchItemUpdate,
WatchSourceCreate,
WatchSourceRead,
WatchSourceUpdate,
)
from app.services import (
list_events,
list_notifications,
list_provider_statuses,
list_watch_sources,
list_watch_items,
run_sync,
)
@@ -114,6 +118,66 @@ def delete_watch_item(watch_item_id: int, db: Session = Depends(get_db)):
db.commit()
@app.get("/watch-sources", response_model=list[WatchSourceRead])
def get_watch_sources(watch_item_id: int | None = None, db: Session = Depends(get_db)):
    """Return the stored watch sources, optionally limited to one watch item.

    Delegates to ``list_watch_sources``; ``watch_item_id`` is passed through
    unchanged (``None`` means no filter).
    """
    sources = list_watch_sources(db, watch_item_id)
    return sources
@app.post(
    "/watch-items/{watch_item_id}/sources",
    response_model=WatchSourceRead,
    status_code=201,
)
def create_watch_source(
    watch_item_id: int,
    payload: WatchSourceCreate,
    db: Session = Depends(get_db),
):
    """Attach a new direct source URL to an existing watch item.

    Raises:
        HTTPException: 404 when no watch item with ``watch_item_id`` exists.

    Returns the persisted ``WatchSource`` after it has been refreshed from the
    database.
    """
    parent = db.get(WatchItem, watch_item_id)
    if parent is None:
        raise HTTPException(status_code=404, detail="Watch item nicht gefunden.")

    new_source = WatchSource(
        watch_item=parent,
        label=payload.label,
        url=payload.url,
        parser_type=payload.parser_type,
    )
    db.add(new_source)
    db.commit()
    # Reload so generated values (id, defaults, timestamps) are populated.
    db.refresh(new_source)
    return new_source
@app.patch("/watch-sources/{source_id}", response_model=WatchSourceRead)
def update_watch_source(
    source_id: int,
    payload: WatchSourceUpdate,
    db: Session = Depends(get_db),
):
    """Partially update a watch source.

    Only fields that the client actually sent are applied
    (``model_dump(exclude_unset=True)``).

    Raises:
        HTTPException: 404 when the source does not exist; 422 when a field
            other than ``label`` is explicitly set to ``null``.

    Returns the refreshed ``WatchSource``.
    """
    source = db.get(WatchSource, source_id)
    if source is None:
        raise HTTPException(status_code=404, detail="Quelle nicht gefunden.")

    updates = payload.model_dump(exclude_unset=True)

    # ``label`` is the only nullable column among the updatable fields. An
    # explicit null for url/parser_type/is_active would otherwise be written
    # to a NOT NULL column and fail at commit time.
    null_fields = sorted(
        field_name
        for field_name, value in updates.items()
        if value is None and field_name != "label"
    )
    if null_fields:
        raise HTTPException(
            status_code=422,
            detail=f"Feld darf nicht null sein: {', '.join(null_fields)}",
        )

    for field_name, value in updates.items():
        setattr(source, field_name, value)
    source.updated_at = datetime.utcnow()
    db.commit()
    db.refresh(source)
    return source
@app.delete("/watch-sources/{source_id}", status_code=204)
def delete_watch_source(source_id: int, db: Session = Depends(get_db)):
    """Delete one watch source.

    Raises:
        HTTPException: 404 when no source with ``source_id`` exists.
    """
    doomed = db.get(WatchSource, source_id)
    if doomed is None:
        raise HTTPException(status_code=404, detail="Quelle nicht gefunden.")

    db.delete(doomed)
    db.commit()
@app.get("/events", response_model=list[TrackedEventRead])
def get_events(db: Session = Depends(get_db)):
    """Return the tracked events as provided by ``list_events``."""
    tracked = list_events(db)
    return tracked
@@ -137,6 +201,16 @@ def update_purchase_status(
return tracked_event
@app.delete("/events/{event_id}", status_code=204)
def delete_event(event_id: int, db: Session = Depends(get_db)):
    """Delete one tracked event.

    Raises:
        HTTPException: 404 when no tracked event with ``event_id`` exists.
    """
    doomed = db.get(TrackedEvent, event_id)
    if doomed is None:
        raise HTTPException(status_code=404, detail="Event nicht gefunden.")

    db.delete(doomed)
    db.commit()
@app.get("/notifications", response_model=list[NotificationLogRead])
def get_notifications(db: Session = Depends(get_db)):
    """Return the notification log entries as provided by ``list_notifications``."""
    entries = list_notifications(db)
    return entries
+37
View File
@@ -35,6 +35,13 @@ class ProviderStatusType(str, Enum):
error = "error"
class SourceStatusType(str, Enum):
    """Outcome of the most recent scan of a watch source."""

    # Column default of WatchSource.last_status.
    pending = "pending"
    # Set by the sync when a scan returned at least one matching event.
    ok = "ok"
    # Set by the sync when a scan completed but returned no events.
    no_match = "no_match"
    # Set by the sync when scanning raised an exception.
    error = "error"
class WatchItem(Base):
__tablename__ = "watch_items"
@@ -57,6 +64,36 @@ class WatchItem(Base):
tracked_events: Mapped[list["TrackedEvent"]] = relationship(
back_populates="watch_item", cascade="all, delete-orphan"
)
sources: Mapped[list["WatchSource"]] = relationship(
back_populates="watch_item", cascade="all, delete-orphan"
)
class WatchSource(Base):
    """A direct source URL attached to a watch item, plus its last scan result."""

    __tablename__ = "watch_sources"

    id: Mapped[int] = mapped_column(Integer, primary_key=True, index=True)
    # Owning watch item; required.
    watch_item_id: Mapped[int] = mapped_column(ForeignKey("watch_items.id"), nullable=False)
    # Optional human-readable name; the only nullable user-editable field.
    label: Mapped[str | None] = mapped_column(String(255), nullable=True)
    url: Mapped[str] = mapped_column(String(1024), nullable=False)
    # Parser selector; defaults to "auto".
    parser_type: Mapped[str] = mapped_column(String(50), default="auto", nullable=False)
    # The sync only scans sources where this flag is true.
    is_active: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)
    # Result of the most recent scan; starts as "pending".
    last_status: Mapped[SourceStatusType] = mapped_column(
        SqlEnum(SourceStatusType), default=SourceStatusType.pending, nullable=False
    )
    last_message: Mapped[str | None] = mapped_column(Text, nullable=True)
    last_checked_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
    # Timestamps come from datetime.utcnow and are stored in a plain DateTime column.
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=datetime.utcnow, nullable=False
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
        nullable=False,
    )

    # Counterpart of WatchItem.sources.
    watch_item: Mapped[WatchItem] = relationship(back_populates="sources")
class TrackedEvent(Base):
+60 -28
View File
@@ -1,4 +1,5 @@
from datetime import datetime
import re
from urllib.parse import urljoin
import requests
@@ -10,7 +11,7 @@ from app.providers.utils import normalize_search_text
class BarclaysArenaProvider:
source_name = "barclays_arena"
events_url = "https://www.barclays-arena.de/events"
events_url = "https://www.barclays-arena.de/events/search"
def search_events(
self,
@@ -29,73 +30,104 @@ class BarclaysArenaProvider:
normalized_term = normalize_search_text(term)
results: list[dict] = []
headings = soup.find_all("h3")
for heading in headings:
for heading in soup.find_all("h3"):
title = heading.get_text(" ", strip=True)
if not title:
continue
subtitle = ""
subtitle_el = heading.find_next("h4")
if subtitle_el:
subtitle = subtitle_el.get_text(" ", strip=True)
subtitle_el = heading.find_next_sibling("h4")
subtitle = subtitle_el.get_text(" ", strip=True) if subtitle_el else ""
# Keep matching local to the actual heading/subtitle pair. Wider
# parent containers often contain several event cards.
haystack = normalize_search_text(f"{title} {subtitle}")
if normalized_term not in haystack:
continue
date_text = self._find_previous_date_text(heading)
event_date = self._parse_german_date(date_text)
link = heading.find_previous("a", href=True)
if link is None:
detail_link = self._find_card_link(heading)
if detail_link is None:
continue
date_text = self._find_card_date_text(heading)
event_date = self._parse_german_date(date_text)
href = detail_link["href"]
results.append(
{
"external_id": link["href"],
"external_id": href,
"title": title,
"matched_term": term,
"venue_name": "Barclays Arena",
"city": "Hamburg",
"country_code": "DE",
"event_date": event_date,
"ticket_url": urljoin(self.events_url, link["href"]),
"ticket_url": urljoin(self.events_url, href),
"image_url": None,
"raw_payload": {
"title": title,
"subtitle": subtitle,
"date_text": date_text,
"href": link["href"],
"href": href,
},
}
)
unique_results: dict[str, dict] = {}
for result in results:
unique_results[result["external_id"]] = result
self.last_status = "ok"
self.last_message = (
f"Barclays Arena returned {len(results)} matched events for term '{term}'."
f"Barclays Arena returned {len(unique_results)} matched events for term '{term}'."
)
return results
return list(unique_results.values())
def _find_previous_date_text(self, heading) -> str | None:
current = heading.previous_sibling
while current is not None:
def _find_card_link(self, heading):
    """Find the event detail link that belongs to ``heading``.

    Prefers an ``<a>`` that wraps the heading and whose href contains
    ``/events/``. Otherwise walks up to five ancestors and accepts the first
    matching link inside an ancestor if that link contains the heading among
    its ``<h3>`` elements. Returns ``None`` when nothing qualifies.
    """
    event_href = re.compile(r"/events/")

    wrapping_link = heading.find_parent("a", href=event_href)
    if wrapping_link is not None:
        return wrapping_link

    ancestor = heading.parent
    depth = 0
    while ancestor is not None and depth < 5:
        candidate = ancestor.find("a", href=event_href)
        # Only accept a link that actually contains this heading, so a
        # neighbouring card's link is not picked up by accident.
        if candidate is not None and heading in candidate.find_all("h3"):
            return candidate
        ancestor = ancestor.parent
        depth += 1
    return None
def _find_card_date_text(self, heading) -> str | None:
current = heading
for _ in range(6):
current = current.previous_element
if current is None:
return None
text = getattr(current, "get_text", lambda *args, **kwargs: str(current))(
" ", strip=True
)
if text and "|" in text:
return text
current = current.previous_sibling
date_text = self._extract_date_text(text)
if date_text:
return date_text
return None
def _extract_date_text(self, text: str) -> str | None:
match = re.search(
r"(Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag)\s*\|\s*(\d{2}\.\d{2}\.\d{4})",
text,
)
if match:
return match.group(2)
match = re.search(r"\b(\d{2}\.\d{2}\.\d{4})\b", text)
if match:
return match.group(1)
return None
def _parse_german_date(self, value: str | None) -> datetime | None:
if not value:
return None
parts = [part.strip() for part in value.split("|")]
if len(parts) < 2:
return None
try:
return datetime.strptime(parts[1], "%d.%m.%Y")
return datetime.strptime(value, "%d.%m.%Y")
except ValueError:
return None
+30
View File
@@ -7,6 +7,7 @@ from app.models import (
NotificationType,
ProviderStatusType,
RegionScope,
SourceStatusType,
WatchType,
)
@@ -39,6 +40,35 @@ class WatchItemRead(BaseModel):
updated_at: datetime
class WatchSourceCreate(BaseModel):
    """Request body for attaching a direct source URL to a watch item."""

    label: str | None = Field(default=None, max_length=255)
    # The URL is fetched by the server during sync and rendered as a link in
    # the UI, so only http(s) URLs are accepted (no javascript:, file:, ...).
    url: str = Field(min_length=8, max_length=1024, pattern=r"^(?i:https?)://\S+$")
    parser_type: str = "auto"
class WatchSourceUpdate(BaseModel):
    """Request body for partially updating a watch source; every field is optional."""

    label: str | None = Field(default=None, max_length=255)
    # The URL is fetched by the server during sync and rendered as a link in
    # the UI, so only http(s) URLs are accepted (no javascript:, file:, ...).
    url: str | None = Field(
        default=None,
        min_length=8,
        max_length=1024,
        pattern=r"^(?i:https?)://\S+$",
    )
    parser_type: str | None = None
    is_active: bool | None = None
class WatchSourceRead(BaseModel):
    """Response schema for a watch source, built from ORM attributes."""

    # from_attributes lets the schema be populated from an ORM object.
    model_config = ConfigDict(from_attributes=True)

    id: int
    watch_item_id: int
    label: str | None
    url: str
    parser_type: str
    is_active: bool
    # Outcome, message and time of the most recent scan.
    last_status: SourceStatusType
    last_message: str | None
    last_checked_at: datetime | None
    created_at: datetime
    updated_at: datetime
class PurchaseUpdate(BaseModel):
is_ticket_purchased: bool
+80
View File
@@ -10,12 +10,15 @@ from app.models import (
NotificationType,
ProviderStatus,
ProviderStatusType,
SourceStatusType,
TrackedEvent,
WatchItem,
WatchSource,
)
from app.notifications import send_email_notification
from app.providers.registry import get_providers
from app.schemas import SyncResult
from app.source_scanner import SourceScanner
logger = logging.getLogger(__name__)
@@ -71,6 +74,13 @@ def list_provider_statuses(db: Session) -> list[ProviderStatus]:
return list(db.scalars(select(ProviderStatus).order_by(ProviderStatus.provider_name)))
def list_watch_sources(db: Session, watch_item_id: int | None = None) -> list[WatchSource]:
    """Return watch sources ordered by creation time.

    When ``watch_item_id`` is given, only sources of that watch item are
    returned.
    """
    query = select(WatchSource)
    if watch_item_id is not None:
        query = query.where(WatchSource.watch_item_id == watch_item_id)
    ordered = query.order_by(WatchSource.created_at)
    return list(db.scalars(ordered))
def update_provider_status(
db: Session,
provider_name: str,
@@ -261,6 +271,7 @@ def upsert_event(
def run_sync(db: Session) -> SyncResult:
providers = get_providers()
source_scanner = SourceScanner()
provider_states = {
provider.source_name: init_provider_sync_state(provider.source_name)
for provider in providers
@@ -275,6 +286,75 @@ def run_sync(db: Session) -> SyncResult:
notifications_skipped = 0
for watch_item in active_items:
active_sources = [source for source in watch_item.sources if source.is_active]
for source in active_sources:
try:
events = source_scanner.scan(watch_item, source)
source.last_status = (
SourceStatusType.ok if events else SourceStatusType.no_match
)
source.last_message = (
f"{len(events)} passende Events gefunden."
if events
else "Keine passenden Events auf dieser Quelle gefunden."
)
source.last_checked_at = datetime.utcnow()
except Exception as exc:
logger.exception(
"Source scan failed for watch_item=%s source=%s",
watch_item.name,
source.url,
)
db.rollback()
source.last_status = SourceStatusType.error
source.last_message = f"Scan fehlgeschlagen: {exc}"
source.last_checked_at = datetime.utcnow()
db.add(source)
db.commit()
continue
for event_data in events:
tracked_event, is_new = upsert_event(
db=db,
watch_item=watch_item,
provider_name=f"source:{source.id}",
event_data=event_data,
)
if is_new:
new_events += 1
else:
updated_events += 1
should_notify = (
is_new
and tracked_event.discovery_notified_at is None
and not has_equivalent_existing_event(db, tracked_event)
)
if should_notify:
status = send_email_notification(
db=db,
tracked_event=tracked_event,
notification_type=NotificationType.discovery,
subject=f"Neuer Termin fuer {watch_item.name}",
body=(
f"Es wurde ein neuer Termin fuer '{watch_item.name}' gefunden.\n\n"
f"Quelle: {source.label or source.url}\n"
f"Titel: {tracked_event.title}\n"
f"Ort: {tracked_event.venue_name or 'unbekannt'}\n"
f"Stadt: {tracked_event.city or 'unbekannt'}\n"
f"Datum: {tracked_event.event_date or 'unbekannt'}\n"
f"Tickets: {tracked_event.ticket_url or 'keine URL'}\n"
),
)
if status == NotificationStatus.sent:
tracked_event.discovery_notified_at = datetime.utcnow()
notifications_sent += 1
else:
notifications_skipped += 1
db.add(source)
db.commit()
for provider in providers:
try:
events = provider.search_events(
+446
View File
@@ -0,0 +1,446 @@
import json
import re
from datetime import datetime
from html import unescape
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from app.models import RegionScope, WatchItem, WatchSource, WatchType
from app.providers.utils import normalize_search_text
# German month names and abbreviations mapped to their month number.
# Keys are looked up after normalize_search_text(); how that helper treats
# umlauts is not visible here, so March is listed in its transliterated
# ("maerz"), stripped ("marz") and original ("märz") spellings.
MONTH_ALIASES = {
    "jan": 1,
    "januar": 1,
    "feb": 2,
    "februar": 2,
    "maer": 3,
    "maerz": 3,
    "mar": 3,
    "marz": 3,
    "mär": 3,
    "märz": 3,
    "apr": 4,
    "april": 4,
    "mai": 5,
    "jun": 6,
    "juni": 6,
    "jul": 7,
    "juli": 7,
    "aug": 8,
    "august": 8,
    "sep": 9,
    "sept": 9,
    "september": 9,
    "okt": 10,
    "oktober": 10,
    "nov": 11,
    "november": 11,
    "dez": 12,
    "dezember": 12,
}
class SourceScanner:
headers = {
"User-Agent": "eventlens/0.1 (+https://local)",
"Accept": "text/html,application/xhtml+xml,application/json",
"Accept-Language": "de-DE,de;q=0.9,en;q=0.7",
}
def scan(self, watch_item: WatchItem, source: WatchSource) -> list[dict]:
    """Download ``source.url`` and extract events matching ``watch_item``.

    The response is handled as JSON when its content type contains
    ``application/json`` and as HTML otherwise.

    Raises:
        requests.HTTPError: via ``raise_for_status`` on an error response.
    """
    response = requests.get(source.url, headers=self.headers, timeout=30)
    response.raise_for_status()

    is_json = "application/json" in response.headers.get("content-type", "")
    if not is_json:
        return self._scan_html(watch_item, source, response.text)
    return self._scan_json(watch_item, source, response.json())
def _scan_json(self, watch_item: WatchItem, source: WatchSource, payload) -> list[dict]:
    """Treat a decoded JSON response as JSON-LD and return the matching events."""
    return self._events_from_jsonld(
        watch_item,
        source,
        self._extract_jsonld_events(payload),
    )
def _scan_html(self, watch_item: WatchItem, source: WatchSource, html: str) -> list[dict]:
    """Extract matching events from an HTML page.

    Embedded ``application/ld+json`` scripts are evaluated first; the
    plain-text scan is only used when they yield no matching event.
    """
    soup = BeautifulSoup(html, "html.parser")

    collected = []
    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string or script.get_text()
        if not raw:
            continue
        try:
            decoded = json.loads(unescape(raw))
        except json.JSONDecodeError:
            # Skip malformed blocks; other scripts on the page may be valid.
            continue
        collected += self._extract_jsonld_events(decoded)

    return self._events_from_jsonld(watch_item, source, collected) or self._events_from_html_text(
        watch_item, source, soup
    )
def _extract_jsonld_events(self, payload) -> list[dict]:
events: list[dict] = []
if isinstance(payload, list):
for item in payload:
events.extend(self._extract_jsonld_events(item))
return events
if not isinstance(payload, dict):
return events
graph = payload.get("@graph")
if isinstance(graph, list):
for item in graph:
events.extend(self._extract_jsonld_events(item))
item_type = payload.get("@type")
if isinstance(item_type, list):
is_event = "Event" in item_type
else:
is_event = item_type == "Event"
if is_event:
events.append(payload)
return events
def _events_from_jsonld(
    self,
    watch_item: WatchItem,
    source: WatchSource,
    events: list[dict],
) -> list[dict]:
    """Convert JSON-LD event objects into event dicts for the sync.

    An event is kept when the normalised watch item name occurs in its title
    or performer names, it is not dated before today (UTC date), and - for
    watch items scoped to Hamburg - its city normalises to ``hamburg``.

    ``location`` may be a dict, a plain string or a list of those; ``address``
    is only evaluated when it is a dict. Malformed values are treated as
    missing instead of raising.
    """
    results: list[dict] = []
    normalized_term = normalize_search_text(watch_item.name)

    for event in events:
        raw_title = event.get("name") or ""
        title = raw_title if isinstance(raw_title, str) else str(raw_title)
        # Guard against non-string performer names so the join cannot fail.
        performers = [
            name
            for name in self._extract_performer_names(event)
            if isinstance(name, str)
        ]
        haystack = normalize_search_text(" ".join([title] + performers))
        if normalized_term not in haystack:
            continue

        location = event.get("location")
        if isinstance(location, list):
            # Several locations: use the first usable entry.
            location = next(
                (entry for entry in location if isinstance(entry, (dict, str))),
                None,
            )
        if isinstance(location, str):
            location = {"name": location}
        elif not isinstance(location, dict):
            location = {}

        address = location.get("address")
        if not isinstance(address, dict):
            address = {}

        city = address.get("addressLocality") or location.get("name")
        if (
            watch_item.region_scope == RegionScope.hamburg
            and normalize_search_text(city or "") != "hamburg"
        ):
            continue

        event_date = self._parse_datetime(event.get("startDate"))
        if event_date and event_date.date() < datetime.utcnow().date():
            continue

        event_url = event.get("url")
        ticket_url = event_url or source.url
        # Do not fall back to source.url for the id: every event without its
        # own @id/url would then share one id and overwrite the others.
        date_key = event_date.date().isoformat() if event_date else "undated"
        external_id = str(
            event.get("@id")
            or event_url
            or f"{source.id}:{normalize_search_text(title)}:{date_key}"
        )

        results.append(
            {
                "external_id": external_id,
                "title": title or watch_item.name,
                "matched_term": watch_item.name,
                "venue_name": location.get("name") or source.label,
                "city": city,
                "country_code": "DE",
                "event_date": event_date,
                "ticket_url": ticket_url,
                "image_url": self._extract_image(event),
                "raw_payload": event,
            }
        )
    return results
def _events_from_html_text(
    self,
    watch_item: WatchItem,
    source: WatchSource,
    soup: BeautifulSoup,
) -> list[dict]:
    """Derive events from the visible text of a page without usable JSON-LD.

    Returns an empty list when the watch item name does not occur on the page.
    Otherwise each matching context node yields at most one event, provided a
    date is found, the date is not before today (UTC date) and - for watch
    items scoped to Hamburg - the context mentions Hamburg. Events are
    de-duplicated by source id, normalised title and date.
    """
    term = watch_item.name
    page_text = soup.get_text(" ", strip=True)
    if normalize_search_text(term) not in normalize_search_text(page_text):
        return []

    hamburg_only = watch_item.region_scope == RegionScope.hamburg
    emitted_keys: set[str] = set()
    found: list[dict] = []

    for node in self._find_matching_contexts(soup, watch_item):
        snippet = node.get_text(" ", strip=True)

        when = self._find_nearest_date(snippet, term)
        if when is None or when.date() < datetime.utcnow().date():
            continue
        if hamburg_only and "hamburg" not in normalize_search_text(snippet):
            continue

        title = self._find_title(node, term)
        ticket_url = self._find_nearest_link(node, term, source.url) or source.url

        dedupe_key = f"{source.id}:{normalize_search_text(title)}:{when.date().isoformat()}"
        if dedupe_key in emitted_keys:
            continue
        emitted_keys.add(dedupe_key)

        found.append(
            {
                "external_id": dedupe_key,
                "title": title,
                "matched_term": term,
                "venue_name": self._find_venue(snippet, source.label),
                "city": "Hamburg" if hamburg_only else None,
                "country_code": "DE",
                "event_date": when,
                "ticket_url": ticket_url,
                "image_url": None,
                "raw_payload": {
                    "source_url": source.url,
                    "parser": "html_text",
                    "context": snippet[:1000],
                },
            }
        )
    return found
def _extract_performer_names(self, event: dict) -> list[str]:
performer = event.get("performer") or event.get("performers")
if isinstance(performer, dict):
return [performer.get("name", "")]
if isinstance(performer, list):
return [item.get("name", "") for item in performer if isinstance(item, dict)]
return []
def _extract_image(self, event: dict) -> str | None:
image = event.get("image")
if isinstance(image, str):
return image
if isinstance(image, list):
for item in image:
if isinstance(item, str):
return item
return None
def _parse_datetime(self, value: str | None) -> datetime | None:
if not value:
return None
try:
return datetime.fromisoformat(value.replace("Z", "+00:00")).replace(tzinfo=None)
except ValueError:
pass
for fmt in ("%d.%m.%Y", "%Y-%m-%d"):
try:
return datetime.strptime(value[:10], fmt)
except ValueError:
continue
return None
def _find_nearest_date(self, text: str, term: str) -> datetime | None:
    """Find a date close to the first mention of ``term`` in ``text``.

    When the term occurs, only a window from 300 characters before to 500
    characters after it is searched; otherwise the whole text is. Numeric
    German dates (``12.05.2026``, ``12.05.26``, ``12.05.``) and named-month
    dates (``12. Mai 2026``, ``Mai 12``) are collected.

    Returns the earliest candidate that is not before today (UTC date); if
    all candidates are in the past, the earliest of those; ``None`` when no
    date is found.
    """
    normalized_term = normalize_search_text(term)
    normalized_text = normalize_search_text(text)
    term_index = normalized_text.find(normalized_term)
    search_area = text
    if term_index >= 0:
        # Offsets in the normalised text need not line up with the raw text,
        # so locate the term in the raw text where possible. If it only
        # matches after normalisation, scale the offset as an approximation.
        raw_match = re.search(re.escape(term), text, re.IGNORECASE) if term else None
        if raw_match:
            anchor = raw_match.start()
        else:
            anchor = term_index * len(text) // max(len(normalized_text), 1)
        start = max(0, anchor - 300)
        end = min(len(text), anchor + 500)
        search_area = text[start:end]

    candidates: list[datetime] = []
    for pattern in (
        r"\b(\d{1,2}\.\d{1,2}\.\d{4})\b",
        r"\b(\d{1,2}\.\d{1,2}\.\d{2})\b",
        # Year-less date. Lookarounds instead of \b: a trailing \b after "."
        # only matches when a word character follows, which misses
        # "12.05. Hamburg" and wrongly matches the prefix of "12.05.2027".
        r"(?<![\d.])(\d{1,2}\.\d{1,2}\.)(?!\d)",
    ):
        for match in re.finditer(pattern, search_area):
            parsed = self._parse_german_date(match.group(1))
            if parsed:
                candidates.append(parsed)

    month_name_pattern = (
        r"jan(?:uar)?|feb(?:ruar)?|m(?:ae|ä)r(?:z)?|apr(?:il)?|mai|jun(?:i)?|"
        r"jul(?:i)?|aug(?:ust)?|sep(?:t|tember)?|okt(?:ober)?|nov(?:ember)?|dez(?:ember)?"
    )
    # Day before month: "12. Mai 2026"
    for match in re.finditer(
        rf"\b(\d{{1,2}})\.?\s+({month_name_pattern})\.?\s*(\d{{4}})?\b",
        search_area,
        re.IGNORECASE,
    ):
        parsed = self._parse_named_month_date(match.group(1), match.group(2), match.group(3))
        if parsed:
            candidates.append(parsed)

    # Month before day: "Mai 12 2026"
    for match in re.finditer(
        rf"\b({month_name_pattern})\.?\s+(\d{{1,2}})\.?\s*(\d{{4}})?\b",
        search_area,
        re.IGNORECASE,
    ):
        parsed = self._parse_named_month_date(match.group(2), match.group(1), match.group(3))
        if parsed:
            candidates.append(parsed)

    if not candidates:
        return None
    today = datetime.utcnow().date()
    upcoming = [candidate for candidate in candidates if candidate.date() >= today]
    return min(upcoming) if upcoming else min(candidates)
def _parse_german_date(self, value: str) -> datetime | None:
cleaned = value.strip()
current_year = datetime.utcnow().year
candidates = [cleaned]
if re.fullmatch(r"\d{1,2}\.\d{1,2}\.", cleaned):
candidates.append(f"{cleaned}{current_year}")
candidates.append(f"{cleaned}{current_year + 1}")
elif re.fullmatch(r"\d{1,2}\.\d{1,2}\.\d{2}", cleaned):
day, month, year = cleaned.split(".")
candidates.append(f"{day}.{month}.20{year}")
for candidate in candidates:
try:
parsed = datetime.strptime(candidate, "%d.%m.%Y")
if parsed.date() < datetime.utcnow().date() and candidate != cleaned:
continue
return parsed
except ValueError:
continue
return None
def _parse_named_month_date(
    self,
    day_value: str,
    month_value: str,
    year_value: str | None,
) -> datetime | None:
    """Build a date from a day, a German month name and an optional year.

    The month is resolved through ``MONTH_ALIASES`` after normalising the name
    and stripping trailing dots. Without a year, the current and then the next
    year (UTC) are tried and the first date not before today is returned; with
    an explicit year the date is returned even if it is in the past.
    Returns ``None`` for an unknown month or an impossible date.
    """
    month_key = normalize_search_text(month_value).rstrip(".")
    month = MONTH_ALIASES.get(month_key)
    if month is None:
        return None

    day = int(day_value)
    if year_value:
        candidate_years = [int(year_value)]
    else:
        this_year = datetime.utcnow().year
        candidate_years = [this_year, this_year + 1]

    for year in candidate_years:
        try:
            parsed = datetime(year, month, day)
        except ValueError:
            continue
        if year_value or parsed.date() >= datetime.utcnow().date():
            return parsed
    return None
def _find_matching_contexts(self, soup: BeautifulSoup, watch_item: WatchItem) -> list:
    """Return the page elements that look like event entries for the watch item.

    A fixed list of CSS selectors is tried in order. An element qualifies when
    its text mentions the watch item name, is at most 3500 characters long and
    contains a date. Each element is evaluated only once. If no element
    qualifies, the single best context found by ``_find_best_context`` is used
    (or an empty list when there is none).
    """
    needle = normalize_search_text(watch_item.name)
    selectors = [
        "li.card",
        ".tourplan .row",
        "[class*=event]",
        "[class*=termin]",
        "article",
        "tr",
        "li",
        ".row",
    ]

    matches = []
    visited = set()
    for selector in selectors:
        for node in soup.select(selector):
            node_key = id(node)
            if node_key in visited:
                continue
            visited.add(node_key)

            text = node.get_text(" ", strip=True)
            qualifies = (
                needle in normalize_search_text(text)
                and len(text) <= 3500
                and self._find_nearest_date(text, watch_item.name)
            )
            if qualifies:
                matches.append(node)

    if matches:
        return matches

    fallback = self._find_best_context(soup, watch_item.name)
    return [] if fallback is None else [fallback]
def _find_venue(self, text: str, default: str | None) -> str | None:
    """Return a short venue line mentioning Hamburg, or ``default``.

    ``text`` is split on line breaks and on runs of two or more whitespace
    characters; the first piece of at most 120 characters whose normalised
    form contains "hamburg" is returned. ``default`` may be ``None`` because
    the caller passes the optional source label.
    """
    lines = [line.strip() for line in re.split(r"\s{2,}|\n|\r", text) if line.strip()]
    for line in lines:
        normalized = normalize_search_text(line)
        if "hamburg" in normalized and len(line) <= 120:
            return line
    return default
def _find_best_context(self, soup: BeautifulSoup, term: str):
    """Pick the most specific element around a mention of ``term``.

    For every text node containing the term, the enclosing element is widened
    via ``_climb_to_context_with_date``. Elements whose text contains a date
    are preferred; among equals the one with the shortest text wins, and the
    first one found wins a tie. Returns ``None`` when the term is not found.
    """
    needle = normalize_search_text(term)

    ranked = []
    for text_node in soup.find_all(string=True):
        if needle not in normalize_search_text(str(text_node)):
            continue
        owner = text_node.parent
        if owner is None:
            continue
        context = self._climb_to_context_with_date(owner, term)
        context_text = context.get_text(" ", strip=True)
        missing_date = 0 if self._find_nearest_date(context_text, term) else 1
        ranked.append((missing_date, len(context_text), context))

    if not ranked:
        return None
    # min() returns the first minimal entry, like a stable sort followed by [0].
    return min(ranked, key=lambda entry: (entry[0], entry[1]))[2]
def _climb_to_context_with_date(self, node, term: str):
    """Walk up from ``node`` until an element's text contains a date.

    At most six elements (the node itself and up to five ancestors) are
    inspected. Returns the first element containing a date, otherwise the
    last element inspected.
    """
    fallback = node
    cursor = node
    inspected = 0
    while cursor is not None and inspected < 6:
        if self._find_nearest_date(cursor.get_text(" ", strip=True), term):
            return cursor
        fallback = cursor
        cursor = cursor.parent
        inspected += 1
    return fallback
def _find_title(self, soup: BeautifulSoup, term: str) -> str:
    """Derive an event title from a context element.

    Order of preference: the text of the first heading, bold or link element
    that mentions the term; a text excerpt that contains a numeric date
    followed by the term; a shorter excerpt around the term; finally the term
    itself. Excerpts have their whitespace collapsed.
    """
    if soup is None:
        return term

    needle = normalize_search_text(term)
    for element in soup.find_all(["h1", "h2", "h3", "h4", "strong", "b", "a"]):
        candidate = element.get_text(" ", strip=True)
        if needle in normalize_search_text(candidate):
            return candidate

    text = soup.get_text(" ", strip=True)
    escaped_term = re.escape(term)
    excerpt_patterns = (
        r"(.{0,40}\d{1,2}\.\d{1,2}\.(?:\d{2,4})?.{0,100}" + escaped_term + r".{0,100})",
        r"(.{0,80}" + escaped_term + r".{0,80})",
    )
    for pattern in excerpt_patterns:
        found = re.search(pattern, text, re.IGNORECASE)
        if found:
            return " ".join(found.group(1).split())
    return term
def _find_nearest_link(self, soup: BeautifulSoup, term: str, base_url: str) -> str | None:
    """Return the absolute URL of the first link whose text mentions ``term``.

    Relative hrefs are resolved against ``base_url``. Returns ``None`` when no
    link text contains the term.
    """
    needle = normalize_search_text(term)
    matching_hrefs = (
        anchor["href"]
        for anchor in soup.find_all("a", href=True)
        if needle in normalize_search_text(anchor.get_text(" ", strip=True))
    )
    first_href = next(matching_hrefs, None)
    return None if first_href is None else urljoin(base_url, first_href)