Auf Webseiten-Modell umgestellt
This commit is contained in:
@@ -114,3 +114,4 @@ sudo docker compose up -d --build
|
|||||||
- Bandsintown benoetigt eine echte, von Bandsintown freigeschaltete App-ID. Ohne diese wird der Provider deaktiviert oder als `blocked` angezeigt.
|
- Bandsintown benoetigt eine echte, von Bandsintown freigeschaltete App-ID. Ohne diese wird der Provider deaktiviert oder als `blocked` angezeigt.
|
||||||
- Barclays Arena wird ueber die offizielle Eventseite der Arena abgefragt.
|
- Barclays Arena wird ueber die offizielle Eventseite der Arena abgefragt.
|
||||||
- Fabrik wird ueber die offizielle Veranstaltungsseite der Fabrik Hamburg abgefragt.
|
- Fabrik wird ueber die offizielle Veranstaltungsseite der Fabrik Hamburg abgefragt.
|
||||||
|
- Fuer robuste persoenliche Ueberwachung koennen pro Watchlist-Eintrag direkte Quellen-URLs hinterlegt werden. Diese werden beim Sync gezielt per JSON-LD und HTML-Textscan durchsucht.
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ const state = {
|
|||||||
events: [],
|
events: [],
|
||||||
notifications: [],
|
notifications: [],
|
||||||
providerStatuses: [],
|
providerStatuses: [],
|
||||||
|
watchSources: [],
|
||||||
};
|
};
|
||||||
|
|
||||||
const watchItemsEl = document.querySelector("#watch-items");
|
const watchItemsEl = document.querySelector("#watch-items");
|
||||||
@@ -92,6 +93,12 @@ function renderStats() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function prettifyProviderName(value) {
|
function prettifyProviderName(value) {
|
||||||
|
if (value?.startsWith("source:")) {
|
||||||
|
const sourceId = Number(value.split(":")[1]);
|
||||||
|
const source = state.watchSources.find((entry) => entry.id === sourceId);
|
||||||
|
return source?.label || "Direkte Quelle";
|
||||||
|
}
|
||||||
|
|
||||||
const names = {
|
const names = {
|
||||||
ticketmaster: "Ticketmaster",
|
ticketmaster: "Ticketmaster",
|
||||||
bandsintown: "Bandsintown",
|
bandsintown: "Bandsintown",
|
||||||
@@ -206,12 +213,71 @@ function renderWatchItems() {
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<p>${escapeHtml(item.notes || "Keine Notiz hinterlegt.")}</p>
|
<p>${escapeHtml(item.notes || "Keine Notiz hinterlegt.")}</p>
|
||||||
|
${renderSourceList(item.id)}
|
||||||
|
<form class="source-form" data-watch-id="${item.id}">
|
||||||
|
<input
|
||||||
|
name="label"
|
||||||
|
type="text"
|
||||||
|
placeholder="Quelle, z. B. Kuenstlerseite"
|
||||||
|
/>
|
||||||
|
<input
|
||||||
|
name="url"
|
||||||
|
type="url"
|
||||||
|
placeholder="https://..."
|
||||||
|
required
|
||||||
|
/>
|
||||||
|
<button type="submit" class="action-button success">Quelle hinzufuegen</button>
|
||||||
|
</form>
|
||||||
</article>
|
</article>
|
||||||
`
|
`
|
||||||
)
|
)
|
||||||
.join("");
|
.join("");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Build the HTML for the direct source URLs attached to one watch item.
 *
 * Reads `state.watchSources` and keeps the entries whose `watch_item_id`
 * equals `watchItemId`. Every row shows the label, the URL, the status and
 * message of the last scan, and two buttons whose `data-action` / `data-id`
 * attributes are evaluated by the document-level click listener.
 *
 * @param {number} watchItemId - Id of the watch item whose sources are listed.
 * @returns {string} HTML markup; a muted placeholder when there are no sources.
 */
function renderSourceList(watchItemId) {
  const sources = state.watchSources.filter((source) => source.watch_item_id === watchItemId);
  if (!sources.length) {
    return '<div class="source-list muted">Noch keine direkten Quellen hinterlegt.</div>';
  }

  // Pill colour for the last scan status; anything but ok/error is a warning.
  const pillClass = (status) => {
    if (status === "ok") {
      return "success";
    }
    return status === "error" ? "danger" : "warning";
  };

  // escapeHtml() prevents markup injection but does not neutralise a
  // `javascript:` (or other non-web) URL inside href, so only http(s) URLs
  // become clickable links; everything else is shown as plain text.
  const renderUrl = (url) => {
    const text = escapeHtml(url);
    return /^https?:\/\//i.test(url)
      ? `<a href="${text}" target="_blank" rel="noreferrer">${text}</a>`
      : `<span class="muted">${text}</span>`;
  };

  const rows = sources
    .map(
      (source) => `
        <div class="source-row">
          <div>
            <strong>${escapeHtml(source.label || "Quelle")}</strong>
            ${renderUrl(source.url)}
            <div class="pill-row">
              <span class="pill ${pillClass(source.last_status)}">${escapeHtml(source.last_status)}</span>
              <span class="muted">${escapeHtml(source.last_message || "Noch nicht gescannt.")}</span>
            </div>
          </div>
          <div class="action-row">
            <button class="action-button" data-action="toggle-source" data-id="${source.id}">
              ${source.is_active ? "Pausieren" : "Aktivieren"}
            </button>
            <button class="action-button danger" data-action="delete-source" data-id="${source.id}">
              Loeschen
            </button>
          </div>
        </div>
      `
    )
    .join("");

  return `
    <div class="source-list">
      ${rows}
    </div>
  `;
}
|
||||||
|
|
||||||
/**
 * Resolve the display name of a watch item.
 *
 * @param {number} id - Watch item id to look up in `state.watchItems`.
 * @returns {string} The item's name, or `Watch #<id>` when the item is
 *   unknown or has no name.
 */
function getWatchNameById(id) {
  const match = state.watchItems.find((item) => item.id === id);
  return match?.name || `Watch #${id}`;
}
|
||||||
@@ -270,6 +336,13 @@ function renderEvents() {
|
|||||||
>
|
>
|
||||||
${event.is_ticket_purchased ? "Ticket entfernen" : "Ticket gekauft"}
|
${event.is_ticket_purchased ? "Ticket entfernen" : "Ticket gekauft"}
|
||||||
</button>
|
</button>
|
||||||
|
<button
|
||||||
|
class="action-button danger"
|
||||||
|
data-action="delete-event"
|
||||||
|
data-id="${event.id}"
|
||||||
|
>
|
||||||
|
Loeschen
|
||||||
|
</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="event-meta">
|
<div class="event-meta">
|
||||||
@@ -328,17 +401,19 @@ function updateSyncStatus(message) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async function loadData() {
|
async function loadData() {
|
||||||
const [watchItems, events, notifications, providerStatuses] = await Promise.all([
|
const [watchItems, events, notifications, providerStatuses, watchSources] = await Promise.all([
|
||||||
apiFetch("/watch-items"),
|
apiFetch("/watch-items"),
|
||||||
apiFetch("/events"),
|
apiFetch("/events"),
|
||||||
apiFetch("/notifications"),
|
apiFetch("/notifications"),
|
||||||
apiFetch("/provider-statuses"),
|
apiFetch("/provider-statuses"),
|
||||||
|
apiFetch("/watch-sources"),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
state.watchItems = watchItems;
|
state.watchItems = watchItems;
|
||||||
state.events = events;
|
state.events = events;
|
||||||
state.notifications = notifications;
|
state.notifications = notifications;
|
||||||
state.providerStatuses = providerStatuses;
|
state.providerStatuses = providerStatuses;
|
||||||
|
state.watchSources = watchSources;
|
||||||
|
|
||||||
renderStats();
|
renderStats();
|
||||||
renderWatchItems();
|
renderWatchItems();
|
||||||
@@ -433,7 +508,60 @@ document.addEventListener("click", async (event) => {
|
|||||||
});
|
});
|
||||||
await loadData();
|
await loadData();
|
||||||
showToast("Ticketstatus aktualisiert.");
|
showToast("Ticketstatus aktualisiert.");
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (action === "delete-event") {
|
||||||
|
await apiFetch(`/events/${id}`, { method: "DELETE" });
|
||||||
|
await loadData();
|
||||||
|
showToast("Event geloescht.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (action === "delete-source") {
|
||||||
|
await apiFetch(`/watch-sources/${id}`, { method: "DELETE" });
|
||||||
|
await loadData();
|
||||||
|
showToast("Quelle geloescht.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (action === "toggle-source") {
|
||||||
|
const source = state.watchSources.find((entry) => entry.id === Number(id));
|
||||||
|
await apiFetch(`/watch-sources/${id}`, {
|
||||||
|
method: "PATCH",
|
||||||
|
body: JSON.stringify({ is_active: !source.is_active }),
|
||||||
|
});
|
||||||
|
await loadData();
|
||||||
|
showToast("Quellenstatus aktualisiert.");
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
showToast(error.message);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
document.addEventListener("submit", async (event) => {
|
||||||
|
const form = event.target.closest(".source-form");
|
||||||
|
if (!form) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
event.preventDefault();
|
||||||
|
const watchId = form.dataset.watchId;
|
||||||
|
const formData = new FormData(form);
|
||||||
|
const payload = {
|
||||||
|
label: formData.get("label")?.toString().trim() || null,
|
||||||
|
url: formData.get("url")?.toString().trim(),
|
||||||
|
parser_type: "auto",
|
||||||
|
};
|
||||||
|
|
||||||
|
try {
|
||||||
|
await apiFetch(`/watch-items/${watchId}/sources`, {
|
||||||
|
method: "POST",
|
||||||
|
body: JSON.stringify(payload),
|
||||||
|
});
|
||||||
|
form.reset();
|
||||||
|
await loadData();
|
||||||
|
showToast("Quelle hinzugefuegt.");
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
showToast(error.message);
|
showToast(error.message);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -381,6 +381,37 @@ button:hover,
|
|||||||
line-height: 1.55;
|
line-height: 1.55;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Vertical stack of direct-source rows. */
.source-list {
  display: grid;
  gap: 10px;
  margin-top: 16px;
}

/* One source entry: two children pushed to opposite ends of the row. */
.source-row {
  display: flex;
  /* `flex-start` rather than the bare `start` keyword: browsers without
     support for `start` in flex containers drop the declaration and fall
     back to `stretch`. */
  align-items: flex-start;
  justify-content: space-between;
  gap: 16px;
  padding: 14px;
  border-radius: var(--radius-sm);
  background: rgba(46, 39, 30, 0.05);
}

/* Source URL: own line, width-capped, long URLs may break anywhere. */
.source-row a {
  display: block;
  max-width: 46ch;
  margin: 4px 0 8px;
  overflow-wrap: anywhere;
  color: var(--primary-dark);
}

/* Add-source form: three-column grid, last column sized to its content. */
.source-form {
  display: grid;
  grid-template-columns: 0.7fr 1.4fr auto;
  gap: 10px;
  margin-top: 14px;
}
|
||||||
|
|
||||||
.action-button {
|
.action-button {
|
||||||
min-height: 38px;
|
min-height: 38px;
|
||||||
padding: 0 14px;
|
padding: 0 14px;
|
||||||
@@ -467,6 +498,7 @@ button:hover,
|
|||||||
}
|
}
|
||||||
|
|
||||||
.watch-form,
|
.watch-form,
|
||||||
|
.source-form,
|
||||||
.status-panel {
|
.status-panel {
|
||||||
grid-template-columns: 1fr;
|
grid-template-columns: 1fr;
|
||||||
}
|
}
|
||||||
|
|||||||
+75
-1
@@ -8,7 +8,7 @@ from sqlalchemy.orm import Session
|
|||||||
|
|
||||||
from app.config import settings
|
from app.config import settings
|
||||||
from app.database import Base, engine, get_db
|
from app.database import Base, engine, get_db
|
||||||
from app.models import TrackedEvent, WatchItem
|
from app.models import TrackedEvent, WatchItem, WatchSource
|
||||||
from app.scheduler import start_scheduler
|
from app.scheduler import start_scheduler
|
||||||
from app.schemas import (
|
from app.schemas import (
|
||||||
NotificationLogRead,
|
NotificationLogRead,
|
||||||
@@ -19,11 +19,15 @@ from app.schemas import (
|
|||||||
WatchItemCreate,
|
WatchItemCreate,
|
||||||
WatchItemRead,
|
WatchItemRead,
|
||||||
WatchItemUpdate,
|
WatchItemUpdate,
|
||||||
|
WatchSourceCreate,
|
||||||
|
WatchSourceRead,
|
||||||
|
WatchSourceUpdate,
|
||||||
)
|
)
|
||||||
from app.services import (
|
from app.services import (
|
||||||
list_events,
|
list_events,
|
||||||
list_notifications,
|
list_notifications,
|
||||||
list_provider_statuses,
|
list_provider_statuses,
|
||||||
|
list_watch_sources,
|
||||||
list_watch_items,
|
list_watch_items,
|
||||||
run_sync,
|
run_sync,
|
||||||
)
|
)
|
||||||
@@ -114,6 +118,66 @@ def delete_watch_item(watch_item_id: int, db: Session = Depends(get_db)):
|
|||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/watch-sources", response_model=list[WatchSourceRead])
def get_watch_sources(watch_item_id: int | None = None, db: Session = Depends(get_db)):
    """List watch sources, optionally restricted to one watch item.

    ``watch_item_id`` is an optional query parameter; it is passed straight
    through to ``list_watch_sources``.
    """
    sources = list_watch_sources(db, watch_item_id)
    return sources
|
||||||
|
|
||||||
|
|
||||||
|
@app.post(
    "/watch-items/{watch_item_id}/sources",
    response_model=WatchSourceRead,
    status_code=201,
)
def create_watch_source(
    watch_item_id: int,
    payload: WatchSourceCreate,
    db: Session = Depends(get_db),
):
    """Attach a new direct source URL to an existing watch item.

    Raises:
        HTTPException: 404 when no watch item with ``watch_item_id`` exists.

    Returns the persisted ``WatchSource`` after refreshing it from the
    database.
    """
    parent = db.get(WatchItem, watch_item_id)
    if parent is None:
        raise HTTPException(status_code=404, detail="Watch item nicht gefunden.")

    new_source = WatchSource(
        watch_item=parent,
        label=payload.label,
        url=payload.url,
        parser_type=payload.parser_type,
    )
    db.add(new_source)
    db.commit()
    # Reload so database-side defaults (id, timestamps, status) are populated.
    db.refresh(new_source)
    return new_source
|
||||||
|
|
||||||
|
|
||||||
|
@app.patch("/watch-sources/{source_id}", response_model=WatchSourceRead)
def update_watch_source(
    source_id: int,
    payload: WatchSourceUpdate,
    db: Session = Depends(get_db),
):
    """Partially update a watch source.

    Only fields present in the request body are applied
    (``exclude_unset=True``).

    Raises:
        HTTPException: 404 when the source does not exist; 422 when a field
            other than ``label`` is explicitly set to ``null``.
    """
    source = db.get(WatchSource, source_id)
    if source is None:
        raise HTTPException(status_code=404, detail="Quelle nicht gefunden.")

    updates = payload.model_dump(exclude_unset=True)

    # Every updatable field is optional in the schema, so a client can send an
    # explicit null. Only `label` is nullable on the model; writing None to
    # url / parser_type / is_active would only fail at commit time with a
    # database error instead of a clean client error.
    nullable_fields = {"label"}
    null_fields = sorted(
        field_name
        for field_name, value in updates.items()
        if value is None and field_name not in nullable_fields
    )
    if null_fields:
        raise HTTPException(
            status_code=422,
            detail=f"Feld darf nicht null sein: {', '.join(null_fields)}",
        )

    for field_name, value in updates.items():
        setattr(source, field_name, value)
    source.updated_at = datetime.utcnow()
    db.commit()
    db.refresh(source)
    return source
|
||||||
|
|
||||||
|
|
||||||
|
@app.delete("/watch-sources/{source_id}", status_code=204)
def delete_watch_source(source_id: int, db: Session = Depends(get_db)):
    """Delete one watch source.

    Raises:
        HTTPException: 404 when no source with ``source_id`` exists.
    """
    existing = db.get(WatchSource, source_id)
    if existing is not None:
        db.delete(existing)
        db.commit()
        return
    raise HTTPException(status_code=404, detail="Quelle nicht gefunden.")
|
||||||
|
|
||||||
|
|
||||||
@app.get("/events", response_model=list[TrackedEventRead])
def get_events(db: Session = Depends(get_db)):
    """Return the tracked events as provided by ``list_events``."""
    events = list_events(db)
    return events
|
||||||
@@ -137,6 +201,16 @@ def update_purchase_status(
|
|||||||
return tracked_event
|
return tracked_event
|
||||||
|
|
||||||
|
|
||||||
|
@app.delete("/events/{event_id}", status_code=204)
def delete_event(event_id: int, db: Session = Depends(get_db)):
    """Delete one tracked event.

    Raises:
        HTTPException: 404 when no event with ``event_id`` exists.
    """
    existing = db.get(TrackedEvent, event_id)
    if existing is not None:
        db.delete(existing)
        db.commit()
        return
    raise HTTPException(status_code=404, detail="Event nicht gefunden.")
|
||||||
|
|
||||||
|
|
||||||
@app.get("/notifications", response_model=list[NotificationLogRead])
def get_notifications(db: Session = Depends(get_db)):
    """Return the notification log entries as provided by ``list_notifications``."""
    entries = list_notifications(db)
    return entries
|
||||||
|
|||||||
@@ -35,6 +35,13 @@ class ProviderStatusType(str, Enum):
|
|||||||
error = "error"
|
error = "error"
|
||||||
|
|
||||||
|
|
||||||
|
class SourceStatusType(str, Enum):
    """Result of the most recent scan of a watch source."""

    # Column default of WatchSource.last_status.
    pending = "pending"
    # Set by the sync when a scan returned at least one event.
    ok = "ok"
    # Set by the sync when a scan succeeded but returned no events.
    no_match = "no_match"
    # Set by the sync when the scan raised an exception.
    error = "error"
|
||||||
|
|
||||||
|
|
||||||
class WatchItem(Base):
|
class WatchItem(Base):
|
||||||
__tablename__ = "watch_items"
|
__tablename__ = "watch_items"
|
||||||
|
|
||||||
@@ -57,6 +64,36 @@ class WatchItem(Base):
|
|||||||
tracked_events: Mapped[list["TrackedEvent"]] = relationship(
|
tracked_events: Mapped[list["TrackedEvent"]] = relationship(
|
||||||
back_populates="watch_item", cascade="all, delete-orphan"
|
back_populates="watch_item", cascade="all, delete-orphan"
|
||||||
)
|
)
|
||||||
|
sources: Mapped[list["WatchSource"]] = relationship(
|
||||||
|
back_populates="watch_item", cascade="all, delete-orphan"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class WatchSource(Base):
    """A direct URL attached to a watch item that the sync scans for events."""

    __tablename__ = "watch_sources"

    id: Mapped[int] = mapped_column(Integer, primary_key=True, index=True)
    # Owning watch item (foreign key to watch_items.id).
    watch_item_id: Mapped[int] = mapped_column(ForeignKey("watch_items.id"), nullable=False)
    # Optional display name for the source.
    label: Mapped[str | None] = mapped_column(String(255), nullable=True)
    # Address that is fetched when the source is scanned.
    url: Mapped[str] = mapped_column(String(1024), nullable=False)
    # Parser selection; defaults to "auto".
    parser_type: Mapped[str] = mapped_column(String(50), default="auto", nullable=False)
    # The sync only scans sources with is_active set.
    is_active: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)
    # Outcome, message and time of the last scan; written by the sync.
    last_status: Mapped[SourceStatusType] = mapped_column(
        SqlEnum(SourceStatusType), default=SourceStatusType.pending, nullable=False
    )
    last_message: Mapped[str | None] = mapped_column(Text, nullable=True)
    last_checked_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
    # Naive UTC timestamps (datetime.utcnow).
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=datetime.utcnow, nullable=False
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=datetime.utcnow,
        onupdate=datetime.utcnow,
        nullable=False,
    )

    # Counterpart of WatchItem.sources.
    watch_item: Mapped[WatchItem] = relationship(back_populates="sources")
|
||||||
|
|
||||||
|
|
||||||
class TrackedEvent(Base):
|
class TrackedEvent(Base):
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
import re
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
@@ -10,7 +11,7 @@ from app.providers.utils import normalize_search_text
|
|||||||
|
|
||||||
class BarclaysArenaProvider:
|
class BarclaysArenaProvider:
|
||||||
source_name = "barclays_arena"
|
source_name = "barclays_arena"
|
||||||
events_url = "https://www.barclays-arena.de/events"
|
events_url = "https://www.barclays-arena.de/events/search"
|
||||||
|
|
||||||
def search_events(
|
def search_events(
|
||||||
self,
|
self,
|
||||||
@@ -29,73 +30,104 @@ class BarclaysArenaProvider:
|
|||||||
normalized_term = normalize_search_text(term)
|
normalized_term = normalize_search_text(term)
|
||||||
results: list[dict] = []
|
results: list[dict] = []
|
||||||
|
|
||||||
headings = soup.find_all("h3")
|
for heading in soup.find_all("h3"):
|
||||||
for heading in headings:
|
|
||||||
title = heading.get_text(" ", strip=True)
|
title = heading.get_text(" ", strip=True)
|
||||||
if not title:
|
if not title:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
subtitle = ""
|
subtitle_el = heading.find_next_sibling("h4")
|
||||||
subtitle_el = heading.find_next("h4")
|
subtitle = subtitle_el.get_text(" ", strip=True) if subtitle_el else ""
|
||||||
if subtitle_el:
|
|
||||||
subtitle = subtitle_el.get_text(" ", strip=True)
|
|
||||||
|
|
||||||
|
# Keep matching local to the actual heading/subtitle pair. Wider
|
||||||
|
# parent containers often contain several event cards.
|
||||||
haystack = normalize_search_text(f"{title} {subtitle}")
|
haystack = normalize_search_text(f"{title} {subtitle}")
|
||||||
if normalized_term not in haystack:
|
if normalized_term not in haystack:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
date_text = self._find_previous_date_text(heading)
|
detail_link = self._find_card_link(heading)
|
||||||
event_date = self._parse_german_date(date_text)
|
if detail_link is None:
|
||||||
|
|
||||||
link = heading.find_previous("a", href=True)
|
|
||||||
if link is None:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
date_text = self._find_card_date_text(heading)
|
||||||
|
event_date = self._parse_german_date(date_text)
|
||||||
|
href = detail_link["href"]
|
||||||
|
|
||||||
results.append(
|
results.append(
|
||||||
{
|
{
|
||||||
"external_id": link["href"],
|
"external_id": href,
|
||||||
"title": title,
|
"title": title,
|
||||||
"matched_term": term,
|
"matched_term": term,
|
||||||
"venue_name": "Barclays Arena",
|
"venue_name": "Barclays Arena",
|
||||||
"city": "Hamburg",
|
"city": "Hamburg",
|
||||||
"country_code": "DE",
|
"country_code": "DE",
|
||||||
"event_date": event_date,
|
"event_date": event_date,
|
||||||
"ticket_url": urljoin(self.events_url, link["href"]),
|
"ticket_url": urljoin(self.events_url, href),
|
||||||
"image_url": None,
|
"image_url": None,
|
||||||
"raw_payload": {
|
"raw_payload": {
|
||||||
"title": title,
|
"title": title,
|
||||||
"subtitle": subtitle,
|
"subtitle": subtitle,
|
||||||
"date_text": date_text,
|
"date_text": date_text,
|
||||||
"href": link["href"],
|
"href": href,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
unique_results: dict[str, dict] = {}
|
||||||
|
for result in results:
|
||||||
|
unique_results[result["external_id"]] = result
|
||||||
|
|
||||||
self.last_status = "ok"
|
self.last_status = "ok"
|
||||||
self.last_message = (
|
self.last_message = (
|
||||||
f"Barclays Arena returned {len(results)} matched events for term '{term}'."
|
f"Barclays Arena returned {len(unique_results)} matched events for term '{term}'."
|
||||||
)
|
)
|
||||||
return results
|
return list(unique_results.values())
|
||||||
|
|
||||||
def _find_previous_date_text(self, heading) -> str | None:
|
def _find_card_link(self, heading):
    """Locate the detail link of the event card that ``heading`` belongs to.

    Returns the ``<a>`` tag whose ``href`` contains ``/events/``, or ``None``
    when no suitable link is found within five ancestor levels.
    """
    # Preferred case: the heading is nested inside the link itself.
    link = heading.find_parent("a", href=re.compile(r"/events/"))
    if link is not None:
        return link

    # Otherwise climb at most five ancestors. For each ancestor only its first
    # matching link is considered, and it is accepted only when this heading
    # is among the link's <h3> descendants.
    current = heading
    for _ in range(5):
        current = current.parent
        if current is None:
            return None
        link = current.find("a", href=re.compile(r"/events/"))
        if link is not None and heading in link.find_all("h3"):
            return link
    return None
|
||||||
|
|
||||||
|
def _find_card_date_text(self, heading) -> str | None:
    """Search the elements just before ``heading`` for a date string.

    Walks back through at most six preceding parse-tree elements and returns
    the first date found by ``_extract_date_text``; ``None`` when the walk
    runs out of elements or none of them contains a date.
    """
    current = heading
    for _ in range(6):
        current = current.previous_element
        if current is None:
            return None
        # Elements without a get_text attribute fall back to str(current).
        text = getattr(current, "get_text", lambda *args, **kwargs: str(current))(
            " ", strip=True
        )
        date_text = self._extract_date_text(text)
        if date_text:
            return date_text
    return None
|
||||||
|
|
||||||
|
def _extract_date_text(self, text: str) -> str | None:
|
||||||
|
match = re.search(
|
||||||
|
r"(Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag)\s*\|\s*(\d{2}\.\d{2}\.\d{4})",
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
if match:
|
||||||
|
return match.group(2)
|
||||||
|
|
||||||
|
match = re.search(r"\b(\d{2}\.\d{2}\.\d{4})\b", text)
|
||||||
|
if match:
|
||||||
|
return match.group(1)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _parse_german_date(self, value: str | None) -> datetime | None:
|
def _parse_german_date(self, value: str | None) -> datetime | None:
|
||||||
if not value:
|
if not value:
|
||||||
return None
|
return None
|
||||||
parts = [part.strip() for part in value.split("|")]
|
|
||||||
if len(parts) < 2:
|
|
||||||
return None
|
|
||||||
try:
|
try:
|
||||||
return datetime.strptime(parts[1], "%d.%m.%Y")
|
return datetime.strptime(value, "%d.%m.%Y")
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from app.models import (
|
|||||||
NotificationType,
|
NotificationType,
|
||||||
ProviderStatusType,
|
ProviderStatusType,
|
||||||
RegionScope,
|
RegionScope,
|
||||||
|
SourceStatusType,
|
||||||
WatchType,
|
WatchType,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -39,6 +40,35 @@ class WatchItemRead(BaseModel):
|
|||||||
updated_at: datetime
|
updated_at: datetime
|
||||||
|
|
||||||
|
|
||||||
|
class WatchSourceCreate(BaseModel):
    """Request body for attaching a direct source URL to a watch item."""

    # Optional display name.
    label: str | None = Field(default=None, max_length=255)
    # The URL is fetched server-side during sync, so require an http(s)
    # scheme up front instead of storing a value that can never be scanned.
    # Leading whitespace and upper-case schemes are tolerated.
    url: str = Field(
        min_length=8,
        max_length=1024,
        pattern=r"^\s*(?i:https?)://",
    )
    parser_type: str = "auto"
|
||||||
|
|
||||||
|
|
||||||
|
class WatchSourceUpdate(BaseModel):
    """Request body for a partial update of a watch source.

    All fields are optional; the PATCH endpoint applies only those that are
    present in the request.
    """

    label: str | None = Field(default=None, max_length=255)
    # Same http(s) requirement as on creation, so an update cannot replace a
    # scannable URL with one the scanner cannot fetch.
    url: str | None = Field(
        default=None,
        min_length=8,
        max_length=1024,
        pattern=r"^\s*(?i:https?)://",
    )
    parser_type: str | None = None
    is_active: bool | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class WatchSourceRead(BaseModel):
    """Response representation of a watch source."""

    # Allows building the schema directly from ORM object attributes.
    model_config = ConfigDict(from_attributes=True)

    id: int
    watch_item_id: int
    label: str | None
    url: str
    parser_type: str
    is_active: bool
    # Status, message and time of the last scan.
    last_status: SourceStatusType
    last_message: str | None
    last_checked_at: datetime | None
    created_at: datetime
    updated_at: datetime
|
||||||
|
|
||||||
|
|
||||||
class PurchaseUpdate(BaseModel):
|
class PurchaseUpdate(BaseModel):
|
||||||
is_ticket_purchased: bool
|
is_ticket_purchased: bool
|
||||||
|
|
||||||
|
|||||||
@@ -10,12 +10,15 @@ from app.models import (
|
|||||||
NotificationType,
|
NotificationType,
|
||||||
ProviderStatus,
|
ProviderStatus,
|
||||||
ProviderStatusType,
|
ProviderStatusType,
|
||||||
|
SourceStatusType,
|
||||||
TrackedEvent,
|
TrackedEvent,
|
||||||
WatchItem,
|
WatchItem,
|
||||||
|
WatchSource,
|
||||||
)
|
)
|
||||||
from app.notifications import send_email_notification
|
from app.notifications import send_email_notification
|
||||||
from app.providers.registry import get_providers
|
from app.providers.registry import get_providers
|
||||||
from app.schemas import SyncResult
|
from app.schemas import SyncResult
|
||||||
|
from app.source_scanner import SourceScanner
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -71,6 +74,13 @@ def list_provider_statuses(db: Session) -> list[ProviderStatus]:
|
|||||||
return list(db.scalars(select(ProviderStatus).order_by(ProviderStatus.provider_name)))
|
return list(db.scalars(select(ProviderStatus).order_by(ProviderStatus.provider_name)))
|
||||||
|
|
||||||
|
|
||||||
|
def list_watch_sources(db: Session, watch_item_id: int | None = None) -> list[WatchSource]:
    """Return watch sources ordered by creation time.

    Args:
        db: Active database session.
        watch_item_id: When given, only sources of that watch item are returned.
    """
    query = select(WatchSource).order_by(WatchSource.created_at)
    if watch_item_id is None:
        return list(db.scalars(query))
    return list(db.scalars(query.where(WatchSource.watch_item_id == watch_item_id)))
|
||||||
|
|
||||||
|
|
||||||
def update_provider_status(
|
def update_provider_status(
|
||||||
db: Session,
|
db: Session,
|
||||||
provider_name: str,
|
provider_name: str,
|
||||||
@@ -261,6 +271,7 @@ def upsert_event(
|
|||||||
|
|
||||||
def run_sync(db: Session) -> SyncResult:
|
def run_sync(db: Session) -> SyncResult:
|
||||||
providers = get_providers()
|
providers = get_providers()
|
||||||
|
source_scanner = SourceScanner()
|
||||||
provider_states = {
|
provider_states = {
|
||||||
provider.source_name: init_provider_sync_state(provider.source_name)
|
provider.source_name: init_provider_sync_state(provider.source_name)
|
||||||
for provider in providers
|
for provider in providers
|
||||||
@@ -275,6 +286,75 @@ def run_sync(db: Session) -> SyncResult:
|
|||||||
notifications_skipped = 0
|
notifications_skipped = 0
|
||||||
|
|
||||||
for watch_item in active_items:
|
for watch_item in active_items:
|
||||||
|
active_sources = [source for source in watch_item.sources if source.is_active]
|
||||||
|
for source in active_sources:
|
||||||
|
try:
|
||||||
|
events = source_scanner.scan(watch_item, source)
|
||||||
|
source.last_status = (
|
||||||
|
SourceStatusType.ok if events else SourceStatusType.no_match
|
||||||
|
)
|
||||||
|
source.last_message = (
|
||||||
|
f"{len(events)} passende Events gefunden."
|
||||||
|
if events
|
||||||
|
else "Keine passenden Events auf dieser Quelle gefunden."
|
||||||
|
)
|
||||||
|
source.last_checked_at = datetime.utcnow()
|
||||||
|
except Exception as exc:
|
||||||
|
logger.exception(
|
||||||
|
"Source scan failed for watch_item=%s source=%s",
|
||||||
|
watch_item.name,
|
||||||
|
source.url,
|
||||||
|
)
|
||||||
|
db.rollback()
|
||||||
|
source.last_status = SourceStatusType.error
|
||||||
|
source.last_message = f"Scan fehlgeschlagen: {exc}"
|
||||||
|
source.last_checked_at = datetime.utcnow()
|
||||||
|
db.add(source)
|
||||||
|
db.commit()
|
||||||
|
continue
|
||||||
|
|
||||||
|
for event_data in events:
|
||||||
|
tracked_event, is_new = upsert_event(
|
||||||
|
db=db,
|
||||||
|
watch_item=watch_item,
|
||||||
|
provider_name=f"source:{source.id}",
|
||||||
|
event_data=event_data,
|
||||||
|
)
|
||||||
|
if is_new:
|
||||||
|
new_events += 1
|
||||||
|
else:
|
||||||
|
updated_events += 1
|
||||||
|
|
||||||
|
should_notify = (
|
||||||
|
is_new
|
||||||
|
and tracked_event.discovery_notified_at is None
|
||||||
|
and not has_equivalent_existing_event(db, tracked_event)
|
||||||
|
)
|
||||||
|
if should_notify:
|
||||||
|
status = send_email_notification(
|
||||||
|
db=db,
|
||||||
|
tracked_event=tracked_event,
|
||||||
|
notification_type=NotificationType.discovery,
|
||||||
|
subject=f"Neuer Termin fuer {watch_item.name}",
|
||||||
|
body=(
|
||||||
|
f"Es wurde ein neuer Termin fuer '{watch_item.name}' gefunden.\n\n"
|
||||||
|
f"Quelle: {source.label or source.url}\n"
|
||||||
|
f"Titel: {tracked_event.title}\n"
|
||||||
|
f"Ort: {tracked_event.venue_name or 'unbekannt'}\n"
|
||||||
|
f"Stadt: {tracked_event.city or 'unbekannt'}\n"
|
||||||
|
f"Datum: {tracked_event.event_date or 'unbekannt'}\n"
|
||||||
|
f"Tickets: {tracked_event.ticket_url or 'keine URL'}\n"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
if status == NotificationStatus.sent:
|
||||||
|
tracked_event.discovery_notified_at = datetime.utcnow()
|
||||||
|
notifications_sent += 1
|
||||||
|
else:
|
||||||
|
notifications_skipped += 1
|
||||||
|
|
||||||
|
db.add(source)
|
||||||
|
db.commit()
|
||||||
|
|
||||||
for provider in providers:
|
for provider in providers:
|
||||||
try:
|
try:
|
||||||
events = provider.search_events(
|
events = provider.search_events(
|
||||||
|
|||||||
@@ -0,0 +1,446 @@
|
|||||||
|
import json
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from html import unescape
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from app.models import RegionScope, WatchItem, WatchSource, WatchType
|
||||||
|
from app.providers.utils import normalize_search_text
|
||||||
|
|
||||||
|
|
||||||
|
# German month names and common abbreviations mapped to the month number.
MONTH_ALIASES = {
    "jan": 1,
    "januar": 1,
    "feb": 2,
    "februar": 2,
    "maer": 3,
    "maerz": 3,
    "mar": 3,
    # The original listed "maerz" twice, so the second entry was a no-op;
    # presumably the umlaut spelling was intended, which is what page text
    # contains unless it has been transliterated first.
    "märz": 3,
    "apr": 4,
    "april": 4,
    "mai": 5,
    "jun": 6,
    "juni": 6,
    "jul": 7,
    "juli": 7,
    "aug": 8,
    "august": 8,
    "sep": 9,
    "sept": 9,
    "september": 9,
    "okt": 10,
    "oktober": 10,
    "nov": 11,
    "november": 11,
    "dez": 12,
    "dezember": 12,
}
|
||||||
|
|
||||||
|
|
||||||
|
class SourceScanner:
|
||||||
|
headers = {
|
||||||
|
"User-Agent": "eventlens/0.1 (+https://local)",
|
||||||
|
"Accept": "text/html,application/xhtml+xml,application/json",
|
||||||
|
"Accept-Language": "de-DE,de;q=0.9,en;q=0.7",
|
||||||
|
}
|
||||||
|
|
||||||
|
def scan(self, watch_item: WatchItem, source: WatchSource) -> list[dict]:
    """Fetch ``source.url`` and extract the events found in the response.

    Responses whose content type contains ``application/json`` are handled
    by ``_scan_json``; everything else is treated as HTML.

    Raises:
        requests.RequestException: On network errors or non-2xx responses.
    """
    response = requests.get(source.url, headers=self.headers, timeout=30)
    response.raise_for_status()

    is_json = "application/json" in response.headers.get("content-type", "")
    if not is_json:
        return self._scan_html(watch_item, source, response.text)
    return self._scan_json(watch_item, source, response.json())
|
||||||
|
|
||||||
|
def _scan_json(self, watch_item: WatchItem, source: WatchSource, payload) -> list[dict]:
    """Extract events from an already decoded JSON payload."""
    return self._events_from_jsonld(
        watch_item,
        source,
        self._extract_jsonld_events(payload),
    )
|
||||||
|
|
||||||
|
def _scan_html(self, watch_item: WatchItem, source: WatchSource, html: str) -> list[dict]:
|
||||||
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
jsonld_events = []
|
||||||
|
|
||||||
|
for script in soup.find_all("script", type="application/ld+json"):
|
||||||
|
raw_payload = script.string or script.get_text()
|
||||||
|
if not raw_payload:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
payload = json.loads(unescape(raw_payload))
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
continue
|
||||||
|
jsonld_events.extend(self._extract_jsonld_events(payload))
|
||||||
|
|
||||||
|
jsonld_results = self._events_from_jsonld(watch_item, source, jsonld_events)
|
||||||
|
if jsonld_results:
|
||||||
|
return jsonld_results
|
||||||
|
|
||||||
|
return self._events_from_html_text(watch_item, source, soup)
|
||||||
|
|
||||||
|
def _extract_jsonld_events(self, payload) -> list[dict]:
|
||||||
|
events: list[dict] = []
|
||||||
|
if isinstance(payload, list):
|
||||||
|
for item in payload:
|
||||||
|
events.extend(self._extract_jsonld_events(item))
|
||||||
|
return events
|
||||||
|
|
||||||
|
if not isinstance(payload, dict):
|
||||||
|
return events
|
||||||
|
|
||||||
|
graph = payload.get("@graph")
|
||||||
|
if isinstance(graph, list):
|
||||||
|
for item in graph:
|
||||||
|
events.extend(self._extract_jsonld_events(item))
|
||||||
|
|
||||||
|
item_type = payload.get("@type")
|
||||||
|
if isinstance(item_type, list):
|
||||||
|
is_event = "Event" in item_type
|
||||||
|
else:
|
||||||
|
is_event = item_type == "Event"
|
||||||
|
if is_event:
|
||||||
|
events.append(payload)
|
||||||
|
|
||||||
|
return events
|
||||||
|
|
||||||
|
def _events_from_jsonld(
|
||||||
|
self,
|
||||||
|
watch_item: WatchItem,
|
||||||
|
source: WatchSource,
|
||||||
|
events: list[dict],
|
||||||
|
) -> list[dict]:
|
||||||
|
results: list[dict] = []
|
||||||
|
normalized_term = normalize_search_text(watch_item.name)
|
||||||
|
|
||||||
|
for event in events:
|
||||||
|
title = event.get("name") or ""
|
||||||
|
performers = self._extract_performer_names(event)
|
||||||
|
haystack = normalize_search_text(" ".join([title] + performers))
|
||||||
|
if normalized_term not in haystack:
|
||||||
|
continue
|
||||||
|
|
||||||
|
location = event.get("location") or {}
|
||||||
|
address = location.get("address") or {}
|
||||||
|
city = address.get("addressLocality") or location.get("name")
|
||||||
|
if watch_item.region_scope == RegionScope.hamburg and normalize_search_text(city) != "hamburg":
|
||||||
|
continue
|
||||||
|
|
||||||
|
event_date = self._parse_datetime(event.get("startDate"))
|
||||||
|
if event_date and event_date.date() < datetime.utcnow().date():
|
||||||
|
continue
|
||||||
|
ticket_url = event.get("url") or source.url
|
||||||
|
|
||||||
|
results.append(
|
||||||
|
{
|
||||||
|
"external_id": str(event.get("@id") or ticket_url or f"{source.id}:{title}"),
|
||||||
|
"title": title or watch_item.name,
|
||||||
|
"matched_term": watch_item.name,
|
||||||
|
"venue_name": location.get("name") or source.label,
|
||||||
|
"city": city,
|
||||||
|
"country_code": "DE",
|
||||||
|
"event_date": event_date,
|
||||||
|
"ticket_url": ticket_url,
|
||||||
|
"image_url": self._extract_image(event),
|
||||||
|
"raw_payload": event,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
def _events_from_html_text(
|
||||||
|
self,
|
||||||
|
watch_item: WatchItem,
|
||||||
|
source: WatchSource,
|
||||||
|
soup: BeautifulSoup,
|
||||||
|
) -> list[dict]:
|
||||||
|
text = soup.get_text(" ", strip=True)
|
||||||
|
normalized_text = normalize_search_text(text)
|
||||||
|
normalized_term = normalize_search_text(watch_item.name)
|
||||||
|
if normalized_term not in normalized_text:
|
||||||
|
return []
|
||||||
|
|
||||||
|
results: list[dict] = []
|
||||||
|
seen_keys: set[str] = set()
|
||||||
|
for context in self._find_matching_contexts(soup, watch_item):
|
||||||
|
context_text = context.get_text(" ", strip=True)
|
||||||
|
event_date = self._find_nearest_date(context_text, watch_item.name)
|
||||||
|
if event_date is None:
|
||||||
|
continue
|
||||||
|
if event_date.date() < datetime.utcnow().date():
|
||||||
|
continue
|
||||||
|
if (
|
||||||
|
watch_item.region_scope == RegionScope.hamburg
|
||||||
|
and "hamburg" not in normalize_search_text(context_text)
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
|
||||||
|
title = self._find_title(context, watch_item.name)
|
||||||
|
link = self._find_nearest_link(context, watch_item.name, source.url) or source.url
|
||||||
|
key = f"{source.id}:{normalize_search_text(title)}:{event_date.date().isoformat()}"
|
||||||
|
if key in seen_keys:
|
||||||
|
continue
|
||||||
|
seen_keys.add(key)
|
||||||
|
|
||||||
|
results.append(
|
||||||
|
{
|
||||||
|
"external_id": key,
|
||||||
|
"title": title,
|
||||||
|
"matched_term": watch_item.name,
|
||||||
|
"venue_name": self._find_venue(context_text, source.label),
|
||||||
|
"city": "Hamburg" if watch_item.region_scope == RegionScope.hamburg else None,
|
||||||
|
"country_code": "DE",
|
||||||
|
"event_date": event_date,
|
||||||
|
"ticket_url": link,
|
||||||
|
"image_url": None,
|
||||||
|
"raw_payload": {
|
||||||
|
"source_url": source.url,
|
||||||
|
"parser": "html_text",
|
||||||
|
"context": context_text[:1000],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
def _extract_performer_names(self, event: dict) -> list[str]:
|
||||||
|
performer = event.get("performer") or event.get("performers")
|
||||||
|
if isinstance(performer, dict):
|
||||||
|
return [performer.get("name", "")]
|
||||||
|
if isinstance(performer, list):
|
||||||
|
return [item.get("name", "") for item in performer if isinstance(item, dict)]
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _extract_image(self, event: dict) -> str | None:
|
||||||
|
image = event.get("image")
|
||||||
|
if isinstance(image, str):
|
||||||
|
return image
|
||||||
|
if isinstance(image, list):
|
||||||
|
for item in image:
|
||||||
|
if isinstance(item, str):
|
||||||
|
return item
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _parse_datetime(self, value: str | None) -> datetime | None:
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return datetime.fromisoformat(value.replace("Z", "+00:00")).replace(tzinfo=None)
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
for fmt in ("%d.%m.%Y", "%Y-%m-%d"):
|
||||||
|
try:
|
||||||
|
return datetime.strptime(value[:10], fmt)
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _find_nearest_date(self, text: str, term: str) -> datetime | None:
|
||||||
|
normalized_term = normalize_search_text(term)
|
||||||
|
normalized_text = normalize_search_text(text)
|
||||||
|
term_index = normalized_text.find(normalized_term)
|
||||||
|
search_area = text
|
||||||
|
if term_index >= 0:
|
||||||
|
start = max(0, term_index - 300)
|
||||||
|
end = min(len(text), term_index + 500)
|
||||||
|
search_area = text[start:end]
|
||||||
|
|
||||||
|
candidates: list[datetime] = []
|
||||||
|
for pattern in (
|
||||||
|
r"\b(\d{1,2}\.\d{1,2}\.\d{4})\b",
|
||||||
|
r"\b(\d{1,2}\.\d{1,2}\.\d{2})\b",
|
||||||
|
r"\b(\d{1,2}\.\d{1,2}\.)\b",
|
||||||
|
):
|
||||||
|
for match in re.finditer(pattern, search_area):
|
||||||
|
parsed = self._parse_german_date(match.group(1))
|
||||||
|
if parsed:
|
||||||
|
candidates.append(parsed)
|
||||||
|
|
||||||
|
month_name_pattern = (
|
||||||
|
r"jan(?:uar)?|feb(?:ruar)?|m(?:ae|ä)r(?:z)?|apr(?:il)?|mai|jun(?:i)?|"
|
||||||
|
r"jul(?:i)?|aug(?:ust)?|sep(?:t|tember)?|okt(?:ober)?|nov(?:ember)?|dez(?:ember)?"
|
||||||
|
)
|
||||||
|
for match in re.finditer(
|
||||||
|
rf"\b(\d{{1,2}})\.?\s+({month_name_pattern})\.?\s*(\d{{4}})?\b",
|
||||||
|
search_area,
|
||||||
|
re.IGNORECASE,
|
||||||
|
):
|
||||||
|
parsed = self._parse_named_month_date(match.group(1), match.group(2), match.group(3))
|
||||||
|
if parsed:
|
||||||
|
candidates.append(parsed)
|
||||||
|
for match in re.finditer(
|
||||||
|
rf"\b({month_name_pattern})\.?\s+(\d{{1,2}})\.?\s*(\d{{4}})?\b",
|
||||||
|
search_area,
|
||||||
|
re.IGNORECASE,
|
||||||
|
):
|
||||||
|
parsed = self._parse_named_month_date(match.group(2), match.group(1), match.group(3))
|
||||||
|
if parsed:
|
||||||
|
candidates.append(parsed)
|
||||||
|
|
||||||
|
future_candidates = [
|
||||||
|
candidate for candidate in candidates if candidate.date() >= datetime.utcnow().date()
|
||||||
|
]
|
||||||
|
if future_candidates:
|
||||||
|
return sorted(future_candidates)[0]
|
||||||
|
return sorted(candidates)[0] if candidates else None
|
||||||
|
|
||||||
|
def _parse_german_date(self, value: str) -> datetime | None:
|
||||||
|
cleaned = value.strip()
|
||||||
|
current_year = datetime.utcnow().year
|
||||||
|
candidates = [cleaned]
|
||||||
|
if re.fullmatch(r"\d{1,2}\.\d{1,2}\.", cleaned):
|
||||||
|
candidates.append(f"{cleaned}{current_year}")
|
||||||
|
candidates.append(f"{cleaned}{current_year + 1}")
|
||||||
|
elif re.fullmatch(r"\d{1,2}\.\d{1,2}\.\d{2}", cleaned):
|
||||||
|
day, month, year = cleaned.split(".")
|
||||||
|
candidates.append(f"{day}.{month}.20{year}")
|
||||||
|
|
||||||
|
for candidate in candidates:
|
||||||
|
try:
|
||||||
|
parsed = datetime.strptime(candidate, "%d.%m.%Y")
|
||||||
|
if parsed.date() < datetime.utcnow().date() and candidate != cleaned:
|
||||||
|
continue
|
||||||
|
return parsed
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _parse_named_month_date(
|
||||||
|
self,
|
||||||
|
day_value: str,
|
||||||
|
month_value: str,
|
||||||
|
year_value: str | None,
|
||||||
|
) -> datetime | None:
|
||||||
|
month = MONTH_ALIASES.get(normalize_search_text(month_value).rstrip("."))
|
||||||
|
if month is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
day = int(day_value)
|
||||||
|
current_year = datetime.utcnow().year
|
||||||
|
years = [int(year_value)] if year_value else [current_year, current_year + 1]
|
||||||
|
for year in years:
|
||||||
|
try:
|
||||||
|
parsed = datetime(year, month, day)
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
if year_value or parsed.date() >= datetime.utcnow().date():
|
||||||
|
return parsed
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _find_matching_contexts(self, soup: BeautifulSoup, watch_item: WatchItem) -> list:
|
||||||
|
normalized_term = normalize_search_text(watch_item.name)
|
||||||
|
selectors = [
|
||||||
|
"li.card",
|
||||||
|
".tourplan .row",
|
||||||
|
"[class*=event]",
|
||||||
|
"[class*=termin]",
|
||||||
|
"article",
|
||||||
|
"tr",
|
||||||
|
"li",
|
||||||
|
".row",
|
||||||
|
]
|
||||||
|
candidates = []
|
||||||
|
seen_nodes = set()
|
||||||
|
|
||||||
|
for selector in selectors:
|
||||||
|
for node in soup.select(selector):
|
||||||
|
if id(node) in seen_nodes:
|
||||||
|
continue
|
||||||
|
seen_nodes.add(id(node))
|
||||||
|
text = node.get_text(" ", strip=True)
|
||||||
|
if normalized_term not in normalize_search_text(text):
|
||||||
|
continue
|
||||||
|
if len(text) > 3500:
|
||||||
|
continue
|
||||||
|
if self._find_nearest_date(text, watch_item.name):
|
||||||
|
candidates.append(node)
|
||||||
|
|
||||||
|
if candidates:
|
||||||
|
return candidates
|
||||||
|
|
||||||
|
fallback = self._find_best_context(soup, watch_item.name)
|
||||||
|
return [fallback] if fallback is not None else []
|
||||||
|
|
||||||
|
def _find_venue(self, text: str, default: str) -> str:
|
||||||
|
lines = [line.strip() for line in re.split(r"\s{2,}|\n|\r", text) if line.strip()]
|
||||||
|
for line in lines:
|
||||||
|
normalized = normalize_search_text(line)
|
||||||
|
if "hamburg" in normalized and len(line) <= 120:
|
||||||
|
return line
|
||||||
|
return default
|
||||||
|
|
||||||
|
def _find_best_context(self, soup: BeautifulSoup, term: str):
|
||||||
|
normalized_term = normalize_search_text(term)
|
||||||
|
candidates = []
|
||||||
|
for node in soup.find_all(string=True):
|
||||||
|
if normalized_term in normalize_search_text(str(node)):
|
||||||
|
parent = node.parent
|
||||||
|
if parent is None:
|
||||||
|
continue
|
||||||
|
best_parent = self._climb_to_context_with_date(parent, term)
|
||||||
|
text = best_parent.get_text(" ", strip=True)
|
||||||
|
candidates.append(
|
||||||
|
(
|
||||||
|
0 if self._find_nearest_date(text, term) else 1,
|
||||||
|
len(text),
|
||||||
|
best_parent,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
if not candidates:
|
||||||
|
return None
|
||||||
|
|
||||||
|
candidates.sort(key=lambda item: (item[0], item[1]))
|
||||||
|
return candidates[0][2]
|
||||||
|
|
||||||
|
def _climb_to_context_with_date(self, node, term: str):
|
||||||
|
current = node
|
||||||
|
best = node
|
||||||
|
for _ in range(6):
|
||||||
|
if current is None:
|
||||||
|
break
|
||||||
|
context_text = current.get_text(" ", strip=True)
|
||||||
|
if self._find_nearest_date(context_text, term):
|
||||||
|
return current
|
||||||
|
best = current
|
||||||
|
current = current.parent
|
||||||
|
return best
|
||||||
|
|
||||||
|
def _find_title(self, soup: BeautifulSoup, term: str) -> str:
|
||||||
|
if soup is None:
|
||||||
|
return term
|
||||||
|
normalized_term = normalize_search_text(term)
|
||||||
|
for heading in soup.find_all(["h1", "h2", "h3", "h4", "strong", "b", "a"]):
|
||||||
|
title = heading.get_text(" ", strip=True)
|
||||||
|
if normalized_term in normalize_search_text(title):
|
||||||
|
return title
|
||||||
|
|
||||||
|
text = soup.get_text(" ", strip=True)
|
||||||
|
dated_match = re.search(
|
||||||
|
r"(.{0,40}\d{1,2}\.\d{1,2}\.(?:\d{2,4})?.{0,100}"
|
||||||
|
+ re.escape(term)
|
||||||
|
+ r".{0,100})",
|
||||||
|
text,
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
if dated_match:
|
||||||
|
return " ".join(dated_match.group(1).split())
|
||||||
|
|
||||||
|
match = re.search(r"(.{0,80}" + re.escape(term) + r".{0,80})", text, re.IGNORECASE)
|
||||||
|
if match:
|
||||||
|
return " ".join(match.group(1).split())
|
||||||
|
return term
|
||||||
|
|
||||||
|
def _find_nearest_link(self, soup: BeautifulSoup, term: str, base_url: str) -> str | None:
|
||||||
|
normalized_term = normalize_search_text(term)
|
||||||
|
for link in soup.find_all("a", href=True):
|
||||||
|
if normalized_term in normalize_search_text(link.get_text(" ", strip=True)):
|
||||||
|
return urljoin(base_url, link["href"])
|
||||||
|
return None
|
||||||
Reference in New Issue
Block a user