Webseitensuche verbessert, UI aufgeräumt

This commit is contained in:
ecki
2026-04-21 13:22:20 +02:00
parent 6cfbdba0a4
commit 1014f822a1
10 changed files with 579 additions and 136 deletions
View File
+3
View File
@@ -14,6 +14,9 @@ EVENTIM_ENABLED=true
POLL_INTERVAL_HOURS=6
REMINDER_INTERVAL_HOURS=12
EVENTLENS_AUTH_USERNAME=
EVENTLENS_AUTH_PASSWORD=
SMTP_HOST=
SMTP_PORT=587
SMTP_USER=
+10 -8
View File
@@ -26,15 +26,16 @@ cp .env.example .env
docker compose up -d --build
```
Danach ist das Webfrontend lokal unter `http://127.0.0.1:8000` erreichbar.
Die Swagger-Oberflaeche liegt unter `http://127.0.0.1:8000/docs`.
API-Statusinfo findest du unter `http://127.0.0.1:8000/api`.
Danach ist das Webfrontend lokal unter `http://127.0.0.1:8001` erreichbar.
Die Swagger-Oberflaeche liegt unter `http://127.0.0.1:8001/docs`.
API-Statusinfo findest du unter `http://127.0.0.1:8001/api`.
## Wichtige Umgebungsvariablen
- `TICKETMASTER_API_KEY`: Ticketmaster Discovery API
- `BANDSINTOWN_APP_ID`: echte Bandsintown App-ID fuer Artist-Events
- `EVENTIM_ENABLED`: aktiviert den Eventim-Website-Provider
- `EVENTLENS_AUTH_USERNAME`, `EVENTLENS_AUTH_PASSWORD`: optionaler Passwortschutz fuer Webfrontend und API
- `NOTIFICATION_EMAIL_TO`: Empfaenger fuer Benachrichtigungen
- `SMTP_HOST`, `SMTP_USER`, `SMTP_PASS`: SMTP-Zugang fuer E-Mails
@@ -43,7 +44,7 @@ API-Statusinfo findest du unter `http://127.0.0.1:8000/api`.
1. Watch Item anlegen:
```bash
curl -X POST http://127.0.0.1:8000/watch-items \
curl -X POST http://127.0.0.1:8001/watch-items \
-H "Content-Type: application/json" \
-d '{
"name": "AnnenMayKantereit",
@@ -55,28 +56,29 @@ curl -X POST http://127.0.0.1:8000/watch-items \
2. Sync manuell anstossen:
```bash
curl -X POST http://127.0.0.1:8000/sync
curl -X POST http://127.0.0.1:8001/sync
```
3. Events abfragen:
```bash
curl http://127.0.0.1:8000/events
curl http://127.0.0.1:8001/events
```
4. Ticketkauf markieren:
```bash
curl -X PATCH http://127.0.0.1:8000/events/1/purchase \
curl -X PATCH http://127.0.0.1:8001/events/1/purchase \
-H "Content-Type: application/json" \
-d '{"is_ticket_purchased": true}'
```
## Hinweise fuer Debian 13 und NGINX
- NGINX kann nativ auf dem Host laufen und auf `127.0.0.1:8000` proxyen.
- NGINX kann nativ auf dem Host laufen und auf `127.0.0.1:8001` proxyen.
- Das Backend lauscht absichtlich nur auf `127.0.0.1`, damit es nicht direkt aus dem Internet erreichbar ist.
- Fuer produktiven Betrieb solltest du die TLS-Terminierung in NGINX aktivieren.
- Setze `EVENTLENS_AUTH_USERNAME` und `EVENTLENS_AUTH_PASSWORD`, wenn Eventlens ueber NGINX, VPN oder Tunnel erreichbar ist.
- Das Frontend wird direkt vom FastAPI-Container ausgeliefert, es ist kein Node- oder Build-Container noetig.
## Bekannte Betriebsfalle
+3
View File
@@ -25,5 +25,8 @@ class Settings:
notification_email_to = os.getenv("NOTIFICATION_EMAIL_TO", "")
smtp_starttls = os.getenv("SMTP_STARTTLS", "true").lower() == "true"
auth_username = os.getenv("EVENTLENS_AUTH_USERNAME", "")
auth_password = os.getenv("EVENTLENS_AUTH_PASSWORD", "")
settings = Settings()
+91 -46
View File
@@ -47,6 +47,47 @@ function formatDate(value) {
}).format(date);
}
/**
 * Parse an event timestamp into a Date.
 *
 * ISO timestamps without an offset (e.g. "2026-04-21T13:22:20") get a "Z"
 * appended so they are read as UTC instead of local time. Every other value
 * is handed to the Date constructor unchanged.
 *
 * @param {*} value Raw timestamp as delivered by the API.
 * @returns {Date|null} The parsed date, or null when the value is not a valid date.
 */
function parseEventDate(value) {
  const isOffsetlessIso =
    typeof value === "string" &&
    /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?$/.test(value);
  const date = new Date(isOffsetlessIso ? `${value}Z` : value);
  return Number.isNaN(date.getTime()) ? null : date;
}

/**
 * Format the calendar day of an event for the date badge (de-DE locale).
 *
 * @param {*} value Raw event timestamp.
 * @returns {*} The formatted day, "unbekannt" for an empty value, or the
 *   raw value itself when it cannot be parsed.
 */
function formatEventDay(value) {
  if (!value) {
    return "unbekannt";
  }

  const date = parseEventDate(value);
  if (date === null) {
    // Show whatever the backend sent rather than hiding it.
    return value;
  }

  return new Intl.DateTimeFormat("de-DE", {
    day: "2-digit",
    month: "short",
    year: "numeric",
  }).format(date);
}

/**
 * Format the time of day of an event for the date badge (de-DE locale).
 *
 * @param {*} value Raw event timestamp.
 * @returns {string} The formatted time, or "" when the value is empty or
 *   cannot be parsed (callers substitute their own placeholder).
 */
function formatEventTime(value) {
  if (!value) {
    return "";
  }

  const date = parseEventDate(value);
  if (date === null) {
    return "";
  }

  return new Intl.DateTimeFormat("de-DE", {
    hour: "2-digit",
    minute: "2-digit",
  }).format(date);
}
function escapeHtml(value) {
return String(value ?? "")
.replaceAll("&", "&")
@@ -212,7 +253,11 @@ function renderWatchItems() {
</button>
</div>
</div>
<p>${escapeHtml(item.notes || "Keine Notiz hinterlegt.")}</p>
${
item.notes
? `<p class="watch-note">${escapeHtml(item.notes)}</p>`
: ""
}
${renderSourceList(item.id)}
<form class="source-form" data-watch-id="${item.id}">
<input
@@ -246,12 +291,9 @@ function renderSourceList(watchItemId) {
.map(
(source) => `
<div class="source-row">
<div>
<strong>${escapeHtml(source.label || "Quelle")}</strong>
<a href="${escapeHtml(source.url)}" target="_blank" rel="noreferrer">
${escapeHtml(source.url)}
</a>
<div class="pill-row">
<div class="source-content">
<div class="source-title-row">
<strong>${escapeHtml(source.label || "Quelle")}</strong>
<span class="pill ${
source.last_status === "ok"
? "success"
@@ -259,8 +301,11 @@ function renderSourceList(watchItemId) {
? "danger"
: "warning"
}">${escapeHtml(source.last_status)}</span>
<span class="muted">${escapeHtml(source.last_message || "Noch nicht gescannt.")}</span>
</div>
<a href="${escapeHtml(source.url)}" target="_blank" rel="noreferrer">
${escapeHtml(source.url)}
</a>
<p class="source-message">${escapeHtml(source.last_message || "Noch nicht gescannt.")}</p>
</div>
<div class="action-row">
<button class="action-button" data-action="toggle-source" data-id="${source.id}">
@@ -313,48 +358,48 @@ function renderEvents() {
.map(
(event) => `
<article class="event-card">
<div class="event-header">
<div>
<div class="event-date-badge">
<strong>${escapeHtml(formatEventDay(event.event_date))}</strong>
<span>${escapeHtml(formatEventTime(event.event_date) || "Zeit offen")}</span>
</div>
<div class="event-content">
<div class="event-header">
<h3>${escapeHtml(event.title)}</h3>
<div class="pill-row">
<span class="pill">${escapeHtml(getWatchNameById(event.watch_item_id))}</span>
<span class="pill">${escapeHtml(event.city || "ohne Stadt")}</span>
<span class="pill ${getProviderClass(event.source)}">
${escapeHtml(prettifyProviderName(event.source))}
</span>
<span class="pill ${event.is_ticket_purchased ? "success" : "warning"}">
${event.is_ticket_purchased ? "Ticket markiert" : "ohne Ticket"}
</span>
</div>
<div class="event-meta">
<span>${escapeHtml(getWatchNameById(event.watch_item_id))}</span>
<span>${escapeHtml(event.city || "ohne Stadt")}</span>
<span>${escapeHtml(event.venue_name || "Venue unbekannt")}</span>
<span>${escapeHtml(prettifyProviderName(event.source))}</span>
</div>
<div class="event-footer">
<span class="pill ${event.is_ticket_purchased ? "success" : "warning"}">
${event.is_ticket_purchased ? "Ticket markiert" : "ohne Ticket"}
</span>
<div class="event-actions">
${
event.ticket_url
? `<a class="action-button event-link" href="${escapeHtml(event.ticket_url)}" target="_blank" rel="noreferrer">Tickets</a>`
: ""
}
<button
class="action-button ${event.is_ticket_purchased ? "" : "success"}"
data-action="toggle-ticket"
data-id="${event.id}"
data-value="${event.is_ticket_purchased ? "false" : "true"}"
>
${event.is_ticket_purchased ? "Ticket entfernen" : "Ticket gekauft"}
</button>
<button
class="action-button danger"
data-action="delete-event"
data-id="${event.id}"
>
Loeschen
</button>
</div>
</div>
<div class="event-actions">
<button
class="action-button ${event.is_ticket_purchased ? "" : "success"}"
data-action="toggle-ticket"
data-id="${event.id}"
data-value="${event.is_ticket_purchased ? "false" : "true"}"
>
${event.is_ticket_purchased ? "Ticket entfernen" : "Ticket gekauft"}
</button>
<button
class="action-button danger"
data-action="delete-event"
data-id="${event.id}"
>
Loeschen
</button>
</div>
</div>
<div class="event-meta">
<span><strong>Datum:</strong> ${escapeHtml(formatDate(event.event_date))}</span>
<span><strong>Venue:</strong> ${escapeHtml(event.venue_name || "unbekannt")}</span>
<span><strong>Quelle:</strong> ${escapeHtml(event.source)}</span>
</div>
${
event.ticket_url
? `<p><a class="event-link" href="${escapeHtml(event.ticket_url)}" target="_blank" rel="noreferrer">Ticketlink oeffnen</a></p>`
: ""
}
</article>
`
)
+157 -55
View File
@@ -11,10 +11,10 @@
--success: #245e3f;
--warning: #8d5a13;
--danger: #8a2f2f;
--shadow: 0 22px 70px rgba(96, 64, 24, 0.14);
--radius-lg: 28px;
--radius-md: 18px;
--radius-sm: 12px;
--shadow: 0 16px 42px rgba(72, 55, 36, 0.11);
--radius-lg: 8px;
--radius-md: 8px;
--radius-sm: 8px;
--mono: "IBM Plex Mono", monospace;
--sans: "Space Grotesk", sans-serif;
}
@@ -29,8 +29,6 @@ body {
font-family: var(--sans);
color: var(--text);
background:
radial-gradient(circle at top left, rgba(194, 77, 44, 0.18), transparent 28%),
radial-gradient(circle at 85% 18%, rgba(36, 94, 63, 0.14), transparent 22%),
linear-gradient(135deg, var(--bg), #f7f2ea 48%, var(--bg-accent));
}
@@ -49,14 +47,14 @@ body::before {
.page-shell {
width: min(1280px, calc(100vw - 32px));
margin: 0 auto;
padding: 32px 0 56px;
padding: 24px 0 44px;
}
.hero {
display: grid;
grid-template-columns: 1.6fr 1fr;
gap: 24px;
margin-bottom: 24px;
gap: 16px;
margin-bottom: 16px;
}
.hero-copy,
@@ -69,7 +67,7 @@ body::before {
}
.hero-copy {
padding: 36px;
padding: 28px;
border-radius: var(--radius-lg);
}
@@ -93,36 +91,36 @@ p {
}
h1 {
font-size: clamp(2.5rem, 5vw, 4.9rem);
line-height: 0.95;
max-width: 11ch;
font-size: clamp(2.2rem, 4vw, 3.8rem);
line-height: 1;
max-width: 14ch;
}
.hero-text {
max-width: 58ch;
margin-top: 18px;
font-size: 1.05rem;
line-height: 1.6;
margin-top: 14px;
font-size: 1rem;
line-height: 1.5;
color: var(--muted);
}
.hero-actions {
display: flex;
flex-wrap: wrap;
gap: 12px;
margin-top: 26px;
gap: 10px;
margin-top: 20px;
}
.status-panel {
display: grid;
grid-template-columns: repeat(2, minmax(0, 1fr));
gap: 14px;
padding: 20px;
gap: 10px;
padding: 14px;
border-radius: var(--radius-lg);
}
.stat-card {
padding: 18px;
padding: 14px;
background: rgba(255, 250, 242, 0.86);
border-radius: var(--radius-md);
border: 1px solid rgba(46, 39, 30, 0.08);
@@ -130,7 +128,7 @@ h1 {
.stat-card strong {
display: block;
font-size: 2rem;
font-size: 1.7rem;
line-height: 1.1;
}
@@ -151,12 +149,12 @@ h1 {
.dashboard-grid {
display: grid;
grid-template-columns: repeat(2, minmax(0, 1fr));
gap: 24px;
grid-template-columns: minmax(360px, 0.9fr) minmax(420px, 1.1fr);
gap: 16px;
}
.panel {
padding: 24px;
padding: 18px;
border-radius: var(--radius-lg);
}
@@ -168,8 +166,8 @@ h1 {
display: flex;
align-items: start;
justify-content: space-between;
gap: 18px;
margin-bottom: 18px;
gap: 14px;
margin-bottom: 14px;
}
.panel-tools {
@@ -180,8 +178,8 @@ h1 {
.watch-form {
display: grid;
grid-template-columns: repeat(2, minmax(0, 1fr));
gap: 14px;
margin-bottom: 20px;
gap: 10px;
margin-bottom: 16px;
}
.full-width {
@@ -206,7 +204,7 @@ input,
select,
textarea {
width: 100%;
padding: 14px 16px;
padding: 10px 12px;
border-radius: var(--radius-sm);
border: 1px solid rgba(46, 39, 30, 0.16);
background: rgba(255, 255, 255, 0.88);
@@ -227,8 +225,8 @@ button,
display: inline-flex;
align-items: center;
justify-content: center;
min-height: 48px;
padding: 0 18px;
min-height: 40px;
padding: 0 14px;
border: 0;
border-radius: 999px;
text-decoration: none;
@@ -258,11 +256,11 @@ button:hover,
.notification-list,
.provider-status-list {
display: grid;
gap: 14px;
gap: 10px;
}
.empty-state {
padding: 28px;
padding: 20px;
border-radius: var(--radius-md);
border: 1px dashed rgba(46, 39, 30, 0.18);
color: var(--muted);
@@ -292,7 +290,7 @@ button:hover,
display: flex;
align-items: start;
justify-content: space-between;
gap: 16px;
gap: 12px;
}
.pill-row,
@@ -302,14 +300,14 @@ button:hover,
.notification-meta {
display: flex;
flex-wrap: wrap;
gap: 8px;
gap: 6px;
}
.pill {
display: inline-flex;
align-items: center;
min-height: 28px;
padding: 0 10px;
min-height: 24px;
padding: 0 8px;
border-radius: 999px;
background: rgba(194, 77, 44, 0.09);
color: var(--primary-dark);
@@ -377,47 +375,74 @@ button:hover,
.event-card p,
.notification-card p,
.provider-status-card p {
margin-top: 12px;
line-height: 1.55;
margin-top: 8px;
line-height: 1.45;
}
.watch-note {
color: var(--muted);
}
.source-list {
display: grid;
gap: 10px;
margin-top: 16px;
gap: 8px;
margin-top: 12px;
}
.source-row {
display: flex;
display: grid;
grid-template-columns: minmax(0, 1fr) auto;
align-items: start;
justify-content: space-between;
gap: 16px;
padding: 14px;
gap: 12px;
padding: 10px;
border-radius: var(--radius-sm);
background: rgba(46, 39, 30, 0.05);
}
.source-content {
min-width: 0;
}
.source-title-row {
display: flex;
align-items: center;
gap: 8px;
}
.source-row a {
display: block;
max-width: 46ch;
margin: 4px 0 8px;
max-width: 100%;
margin: 4px 0;
overflow-wrap: anywhere;
color: var(--primary-dark);
font-size: 0.9rem;
}
.source-message {
margin: 0;
color: var(--muted);
font-size: 0.9rem;
}
.source-form {
display: grid;
grid-template-columns: 0.7fr 1.4fr auto;
gap: 10px;
margin-top: 14px;
gap: 8px;
margin-top: 10px;
}
.action-button {
min-height: 38px;
padding: 0 14px;
display: inline-flex;
align-items: center;
justify-content: center;
min-height: 32px;
padding: 0 10px;
border-radius: 999px;
background: rgba(46, 39, 30, 0.08);
color: var(--text);
text-decoration: none;
white-space: nowrap;
font-size: 0.92rem;
}
.action-button.danger {
@@ -430,16 +455,67 @@ button:hover,
color: var(--success);
}
.event-card {
display: grid;
grid-template-columns: 112px minmax(0, 1fr);
gap: 14px;
padding: 14px;
}
.event-date-badge {
display: grid;
align-content: center;
min-height: 86px;
padding: 10px;
border-radius: var(--radius-sm);
background: rgba(194, 77, 44, 0.1);
color: var(--primary-dark);
}
.event-date-badge strong {
font-size: 1rem;
line-height: 1.15;
}
.event-date-badge span {
margin-top: 4px;
color: var(--muted);
font-family: var(--mono);
font-size: 0.78rem;
}
.event-content {
min-width: 0;
}
.event-header h3 {
overflow-wrap: anywhere;
}
.event-meta,
.notification-meta {
margin-top: 14px;
margin-top: 8px;
color: var(--muted);
font-size: 0.94rem;
font-size: 0.9rem;
}
.event-meta span:not(:last-child)::after {
content: "/";
margin-left: 8px;
color: rgba(101, 89, 77, 0.52);
}
.event-footer {
display: flex;
align-items: center;
justify-content: space-between;
gap: 10px;
margin-top: 12px;
}
.event-link {
color: var(--primary-dark);
text-decoration-thickness: 2px;
background: rgba(194, 77, 44, 0.09);
}
.toast {
@@ -514,4 +590,30 @@ button:hover,
.notification-header {
flex-direction: column;
}
.source-row,
.event-card {
grid-template-columns: 1fr;
}
.event-date-badge {
min-height: auto;
}
.event-footer,
.source-title-row {
align-items: flex-start;
flex-direction: column;
}
.event-actions,
.action-row,
.panel-tools {
width: 100%;
}
.event-actions .action-button,
.action-row .action-button {
flex: 1 1 140px;
}
}
+30 -1
View File
@@ -1,8 +1,10 @@
import base64
import secrets
from pathlib import Path
from datetime import datetime
from fastapi import Depends, FastAPI, HTTPException
from fastapi.responses import FileResponse
from fastapi.responses import FileResponse, Response
from fastapi.staticfiles import StaticFiles
from sqlalchemy.orm import Session
@@ -44,6 +46,33 @@ static_dir = frontend_dir / "static"
app.mount("/static", StaticFiles(directory=static_dir), name="static")
@app.middleware("http")
async def require_basic_auth(request, call_next):
    """Protect every request with HTTP Basic auth when credentials are configured.

    If either ``settings.auth_username`` or ``settings.auth_password`` is
    empty, the request is passed through untouched. Otherwise the
    ``Authorization`` header must carry matching Basic credentials; any
    other request is answered with 401 and a ``WWW-Authenticate`` challenge.
    """
    if not settings.auth_username or not settings.auth_password:
        return await call_next(request)

    authorization = request.headers.get("authorization", "")
    scheme, _, credentials = authorization.partition(" ")
    if scheme.lower() == "basic" and credentials:
        try:
            decoded = base64.b64decode(credentials).decode("utf-8")
        except (ValueError, UnicodeDecodeError):
            decoded = ""

        username, separator, password = decoded.partition(":")
        # compare_digest() only accepts ASCII when given str and raises
        # TypeError otherwise, so compare the UTF-8 bytes instead. Both
        # comparisons are always evaluated so that the response time does not
        # reveal whether the username alone was correct.
        username_ok = secrets.compare_digest(
            username.encode("utf-8"),
            settings.auth_username.encode("utf-8"),
        )
        password_ok = secrets.compare_digest(
            password.encode("utf-8"),
            settings.auth_password.encode("utf-8"),
        )
        if separator and username_ok and password_ok:
            return await call_next(request)

    return Response(
        status_code=401,
        headers={"WWW-Authenticate": 'Basic realm="eventlens"'},
    )
@app.on_event("startup")
def startup():
Base.metadata.create_all(bind=engine)
+38 -4
View File
@@ -42,7 +42,7 @@ def list_watch_items(db: Session) -> list[WatchItem]:
def list_events(db: Session) -> list[TrackedEvent]:
events = list(db.scalars(select(TrackedEvent).order_by(desc(TrackedEvent.event_date))))
events = list(db.scalars(select(TrackedEvent)))
deduped: list[TrackedEvent] = []
for event in events:
@@ -61,7 +61,16 @@ def list_events(db: Session) -> list[TrackedEvent]:
if is_preferred_event(event, deduped[duplicate_index]):
deduped[duplicate_index] = event
return deduped
return sorted(deduped, key=event_sort_key)
def event_sort_key(event: TrackedEvent):
    """Return the sort key that orders events for display.

    Keys sort into three groups: upcoming events (group 0, soonest first),
    events without a date (group 1), and past events (group 2, most recent
    first).
    """
    when = event.event_date
    if when is None:
        return (1, datetime.max)
    if when.date() < datetime.utcnow().date():
        # Mirror the date around the datetime range so that a later past date
        # yields a smaller key, i.e. the most recent past event comes first.
        return (2, datetime.max - (when - datetime.min))
    return (0, when)
def list_notifications(db: Session):
@@ -269,6 +278,27 @@ def upsert_event(
return tracked_event, is_new
def prune_stale_source_events(
    db: Session,
    watch_item: WatchItem,
    source_name: str,
    seen_external_ids: set[str],
) -> int:
    """Delete events of one source that were not seen in the latest scan.

    Only events of ``watch_item`` whose ``source`` equals ``source_name`` and
    that are not marked as purchased are considered. Every such event whose
    ``external_id`` is missing from ``seen_external_ids`` is deleted from the
    session; the caller is responsible for committing.

    Note that an empty ``seen_external_ids`` removes every unpurchased event
    of that source.

    Returns:
        The number of deleted events.
    """
    candidates = db.scalars(
        select(TrackedEvent).where(
            TrackedEvent.watch_item_id == watch_item.id,
            TrackedEvent.source == source_name,
            TrackedEvent.is_ticket_purchased.is_(False),
        )
    )

    removed = 0
    # Materialize the result first so deletions never interleave with fetching.
    for tracked in list(candidates):
        if tracked.external_id in seen_external_ids:
            continue
        db.delete(tracked)
        removed += 1
    return removed
def run_sync(db: Session) -> SyncResult:
providers = get_providers()
source_scanner = SourceScanner()
@@ -288,12 +318,14 @@ def run_sync(db: Session) -> SyncResult:
for watch_item in active_items:
active_sources = [source for source in watch_item.sources if source.is_active]
for source in active_sources:
source_name = f"source:{source.id}"
seen_source_event_ids: set[str] = set()
try:
events = source_scanner.scan(watch_item, source)
source.last_status = (
SourceStatusType.ok if events else SourceStatusType.no_match
)
source.last_message = (
source.last_message = source_scanner.last_message or (
f"{len(events)} passende Events gefunden."
if events
else "Keine passenden Events auf dieser Quelle gefunden."
@@ -317,9 +349,10 @@ def run_sync(db: Session) -> SyncResult:
tracked_event, is_new = upsert_event(
db=db,
watch_item=watch_item,
provider_name=f"source:{source.id}",
provider_name=source_name,
event_data=event_data,
)
seen_source_event_ids.add(tracked_event.external_id)
if is_new:
new_events += 1
else:
@@ -353,6 +386,7 @@ def run_sync(db: Session) -> SyncResult:
notifications_skipped += 1
db.add(source)
prune_stale_source_events(db, watch_item, source_name, seen_source_event_ids)
db.commit()
for provider in providers:
+246 -21
View File
@@ -1,8 +1,9 @@
import json
import re
from datetime import datetime
from difflib import SequenceMatcher
from html import unescape
from urllib.parse import urljoin
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
@@ -40,6 +41,20 @@ MONTH_ALIASES = {
"dezember": 12,
}
FOLLOW_LINK_KEYWORDS = (
"event",
"gig",
"konzert",
"live",
"rausgegangen",
"show",
"termin",
"ticket",
"tour",
)
MAX_FOLLOWED_LINKS = 6
class SourceScanner:
headers = {
@@ -48,7 +63,11 @@ class SourceScanner:
"Accept-Language": "de-DE,de;q=0.9,en;q=0.7",
}
def __init__(self):
self.last_message = ""
def scan(self, watch_item: WatchItem, source: WatchSource) -> list[dict]:
self.last_message = ""
response = requests.get(
source.url,
headers=self.headers,
@@ -58,16 +77,30 @@ class SourceScanner:
content_type = response.headers.get("content-type", "")
if "application/json" in content_type:
return self._scan_json(watch_item, source, response.json())
results = self._scan_json(watch_item, source, response.json())
if not self.last_message:
self.last_message = self._build_scan_message(results, "JSON-Daten ausgewertet.")
return results
return self._scan_html(watch_item, source, response.text)
results = self._scan_html(watch_item, source, response.text, response.url)
if not self.last_message:
self.last_message = self._build_scan_message(results, "HTML-Seite ausgewertet.")
return results
def _scan_json(self, watch_item: WatchItem, source: WatchSource, payload) -> list[dict]:
events = self._extract_jsonld_events(payload)
return self._events_from_jsonld(watch_item, source, events)
def _scan_html(self, watch_item: WatchItem, source: WatchSource, html: str) -> list[dict]:
def _scan_html(
self,
watch_item: WatchItem,
source: WatchSource,
html: str,
base_url: str | None = None,
follow_links: bool = True,
) -> list[dict]:
soup = BeautifulSoup(html, "html.parser")
source_url = base_url or source.url
jsonld_events = []
for script in soup.find_all("script", type="application/ld+json"):
@@ -84,7 +117,16 @@ class SourceScanner:
if jsonld_results:
return jsonld_results
return self._events_from_html_text(watch_item, source, soup)
html_results = self._events_from_html_text(watch_item, source, soup, source_url)
if html_results:
return html_results
if follow_links:
linked_results = self._events_from_linked_pages(watch_item, source, soup, source_url)
if linked_results:
return linked_results
return []
def _extract_jsonld_events(self, payload) -> list[dict]:
events: list[dict] = []
@@ -119,22 +161,28 @@ class SourceScanner:
) -> list[dict]:
results: list[dict] = []
normalized_term = normalize_search_text(watch_item.name)
matching_name_count = 0
outside_region_count = 0
past_count = 0
for event in events:
title = event.get("name") or ""
performers = self._extract_performer_names(event)
haystack = normalize_search_text(" ".join([title] + performers))
if normalized_term not in haystack:
if not self._term_matches_normalized(normalized_term, haystack):
continue
matching_name_count += 1
location = event.get("location") or {}
address = location.get("address") or {}
city = address.get("addressLocality") or location.get("name")
if watch_item.region_scope == RegionScope.hamburg and normalize_search_text(city) != "hamburg":
outside_region_count += 1
continue
event_date = self._parse_datetime(event.get("startDate"))
if event_date and event_date.date() < datetime.utcnow().date():
past_count += 1
continue
ticket_url = event.get("url") or source.url
@@ -153,6 +201,22 @@ class SourceScanner:
}
)
if results:
self.last_message = f"{len(results)} passende Events in strukturierten Daten gefunden."
elif outside_region_count:
self.last_message = (
f"Strukturierte Daten gefunden, aber kein Termin fuer '{watch_item.name}' "
"in Hamburg."
)
elif past_count:
self.last_message = (
f"Strukturierte Daten gefunden, aber nur vergangene Termine fuer '{watch_item.name}'."
)
elif matching_name_count:
self.last_message = (
f"Strukturierte Daten gefunden, aber keine passenden Termine fuer '{watch_item.name}'."
)
return results
def _events_from_html_text(
@@ -160,30 +224,41 @@ class SourceScanner:
watch_item: WatchItem,
source: WatchSource,
soup: BeautifulSoup,
base_url: str | None = None,
) -> list[dict]:
text = soup.get_text(" ", strip=True)
normalized_text = normalize_search_text(text)
normalized_term = normalize_search_text(watch_item.name)
if normalized_term not in normalized_text:
if not self._term_matches_normalized(normalized_term, normalized_text):
self.last_message = (
f"Seite erreichbar, aber der Name '{watch_item.name}' wurde nicht gefunden."
)
return []
results: list[dict] = []
seen_keys: set[str] = set()
contexts_with_date = 0
past_contexts = 0
outside_region_contexts = 0
for context in self._find_matching_contexts(soup, watch_item):
context_text = context.get_text(" ", strip=True)
event_date = self._find_nearest_date(context_text, watch_item.name)
if event_date is None:
continue
contexts_with_date += 1
if event_date.date() < datetime.utcnow().date():
past_contexts += 1
continue
if (
watch_item.region_scope == RegionScope.hamburg
and "hamburg" not in normalize_search_text(context_text)
):
outside_region_contexts += 1
continue
title = self._find_title(context, watch_item.name)
link = self._find_nearest_link(context, watch_item.name, source.url) or source.url
context_url = base_url or source.url
link = self._find_nearest_link(context, watch_item.name, context_url) or context_url
key = f"{source.id}:{normalize_search_text(title)}:{event_date.date().isoformat()}"
if key in seen_keys:
continue
@@ -201,15 +276,118 @@ class SourceScanner:
"ticket_url": link,
"image_url": None,
"raw_payload": {
"source_url": source.url,
"source_url": context_url,
"parser": "html_text",
"context": context_text[:1000],
},
}
)
if results:
self.last_message = f"{len(results)} passende Events gefunden."
elif outside_region_contexts:
self.last_message = (
f"Seite erreichbar, Termine fuer '{watch_item.name}' gefunden, "
"aber keiner in Hamburg."
)
elif past_contexts:
self.last_message = (
f"Seite erreichbar, aber nur vergangene Termine fuer '{watch_item.name}' gefunden."
)
elif contexts_with_date == 0:
self.last_message = (
f"Seite erreichbar, Name '{watch_item.name}' gefunden, "
"aber keine auswertbaren Termine."
)
return results
def _events_from_linked_pages(
    self,
    watch_item: WatchItem,
    source: WatchSource,
    soup: BeautifulSoup,
    base_url: str,
) -> list[dict]:
    """Scan promising links of a page for events when the page itself had none.

    The candidate links come from ``_candidate_follow_links``. Each one is
    fetched and scanned as JSON or HTML; linked HTML pages are scanned with
    ``follow_links=False`` so the crawl never goes deeper than one level.
    Links that cannot be fetched or parsed are skipped so that one broken
    page does not abort the scan of the whole source.

    Returns:
        The events found on linked pages, de-duplicated by ``external_id``.
        ``self.last_message`` is updated when events were found or at least
        one link could be fetched.
    """
    results: list[dict] = []
    seen_event_keys: set[str] = set()
    checked_links = 0

    for link_url in self._candidate_follow_links(soup, base_url):
        try:
            response = requests.get(link_url, headers=self.headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException:
            continue

        checked_links += 1
        content_type = response.headers.get("content-type", "")
        if "application/json" in content_type:
            try:
                payload = response.json()
            except ValueError:
                # Declared as JSON but not parseable: treat like any other
                # unusable link instead of failing the whole source scan.
                continue
            linked_results = self._scan_json(watch_item, source, payload)
        elif "text/html" in content_type or "application/xhtml+xml" in content_type or not content_type:
            linked_results = self._scan_html(
                watch_item,
                source,
                response.text,
                response.url,
                follow_links=False,
            )
        else:
            continue

        for event in linked_results:
            key = event["external_id"]
            if key in seen_event_keys:
                continue
            seen_event_keys.add(key)
            results.append(event)

    if results:
        self.last_message = f"{len(results)} passende Events auf verlinkten Seiten gefunden."
    elif checked_links:
        self.last_message = (
            f"Seite erreichbar, {checked_links} relevante Links geprueft, "
            "aber keine passenden Events gefunden."
        )

    return results
def _build_scan_message(self, results: list[dict], fallback: str) -> str:
if results:
return f"{len(results)} passende Events gefunden."
return f"{fallback} Keine passenden Events gefunden."
def _candidate_follow_links(self, soup: BeautifulSoup, base_url: str) -> list[str]:
    """Pick the most promising links on a page to follow for event data.

    A link qualifies when it is an http(s) URL other than the page itself and
    its text, href, host or path contains at least one entry of
    ``FOLLOW_LINK_KEYWORDS``. Links are scored by the number of keyword hits
    plus one bonus point for staying on the same host.

    URL fragments are removed before de-duplication: a fragment never reaches
    the server, so ``/tour#a`` and ``/tour#b`` are the same page and must not
    use up several slots of the follow budget.

    Returns:
        At most ``MAX_FOLLOWED_LINKS`` absolute URLs, best score first.
    """
    from urllib.parse import urldefrag

    base_page = urldefrag(base_url).url
    base_host = urlparse(base_page).netloc.lower()
    scored_links: list[tuple[int, str]] = []
    seen_urls: set[str] = set()

    for link in soup.find_all("a", href=True):
        href = link["href"].strip()
        if not href or href.startswith(("#", "mailto:", "tel:", "javascript:")):
            continue

        link_url = urldefrag(urljoin(base_page, href)).url
        if link_url in seen_urls or link_url.rstrip("/") == base_page.rstrip("/"):
            continue
        seen_urls.add(link_url)

        parsed = urlparse(link_url)
        if parsed.scheme not in {"http", "https"}:
            continue

        link_text = link.get_text(" ", strip=True)
        haystack = normalize_search_text(" ".join([link_text, href, parsed.netloc, parsed.path]))
        keyword_hits = sum(1 for keyword in FOLLOW_LINK_KEYWORDS if keyword in haystack)
        if keyword_hits == 0:
            continue

        same_host_bonus = 1 if parsed.netloc.lower() == base_host else 0
        scored_links.append((keyword_hits + same_host_bonus, link_url))

    # The sort is stable, so links with equal score keep their page order.
    scored_links.sort(key=lambda item: item[0], reverse=True)
    return [link_url for _, link_url in scored_links[:MAX_FOLLOWED_LINKS]]
def _extract_performer_names(self, event: dict) -> list[str]:
performer = event.get("performer") or event.get("performers")
if isinstance(performer, dict):
@@ -252,16 +430,22 @@ class SourceScanner:
end = min(len(text), term_index + 500)
search_area = text[start:end]
candidates: list[datetime] = []
explicit_candidates: list[datetime] = []
inferred_candidates: list[datetime] = []
for pattern in (
r"\b(\d{1,2}\.\d{1,2}\.\d{4})\b",
r"\b(\d{1,2}\.\d{1,2}\.\d{2})\b",
r"\b(\d{1,2}\.\d{1,2}\.)\b",
r"\b(\d{1,2}\.\d{1,2}\.\d{2})(?!\d)",
):
for match in re.finditer(pattern, search_area):
parsed = self._parse_german_date(match.group(1))
if parsed:
candidates.append(parsed)
explicit_candidates.append(parsed)
for match in re.finditer(r"\b(\d{1,2}\.\d{1,2}\.)(?!\d)", search_area):
parsed = self._parse_german_date(match.group(1))
if parsed:
inferred_candidates.append(parsed)
month_name_pattern = (
r"jan(?:uar)?|feb(?:ruar)?|m(?:ae|ä)r(?:z)?|apr(?:il)?|mai|jun(?:i)?|"
@@ -274,7 +458,10 @@ class SourceScanner:
):
parsed = self._parse_named_month_date(match.group(1), match.group(2), match.group(3))
if parsed:
candidates.append(parsed)
if match.group(3):
explicit_candidates.append(parsed)
else:
inferred_candidates.append(parsed)
for match in re.finditer(
rf"\b({month_name_pattern})\.?\s+(\d{{1,2}})\.?\s*(\d{{4}})?\b",
search_area,
@@ -282,14 +469,27 @@ class SourceScanner:
):
parsed = self._parse_named_month_date(match.group(2), match.group(1), match.group(3))
if parsed:
candidates.append(parsed)
if match.group(3):
explicit_candidates.append(parsed)
else:
inferred_candidates.append(parsed)
if explicit_candidates:
future_explicit_candidates = [
candidate
for candidate in explicit_candidates
if candidate.date() >= datetime.utcnow().date()
]
if future_explicit_candidates:
return sorted(future_explicit_candidates)[0]
return sorted(explicit_candidates)[0]
future_candidates = [
candidate for candidate in candidates if candidate.date() >= datetime.utcnow().date()
candidate for candidate in inferred_candidates if candidate.date() >= datetime.utcnow().date()
]
if future_candidates:
return sorted(future_candidates)[0]
return sorted(candidates)[0] if candidates else None
return sorted(inferred_candidates)[0] if inferred_candidates else None
def _parse_german_date(self, value: str) -> datetime | None:
cleaned = value.strip()
@@ -342,6 +542,7 @@ class SourceScanner:
"[class*=event]",
"[class*=termin]",
"article",
"a[href]",
"tr",
"li",
".row",
@@ -355,7 +556,7 @@ class SourceScanner:
continue
seen_nodes.add(id(node))
text = node.get_text(" ", strip=True)
if normalized_term not in normalize_search_text(text):
if not self._term_matches_normalized(normalized_term, normalize_search_text(text)):
continue
if len(text) > 3500:
continue
@@ -380,7 +581,7 @@ class SourceScanner:
normalized_term = normalize_search_text(term)
candidates = []
for node in soup.find_all(string=True):
if normalized_term in normalize_search_text(str(node)):
if self._term_matches_normalized(normalized_term, normalize_search_text(str(node))):
parent = node.parent
if parent is None:
continue
@@ -419,7 +620,7 @@ class SourceScanner:
normalized_term = normalize_search_text(term)
for heading in soup.find_all(["h1", "h2", "h3", "h4", "strong", "b", "a"]):
title = heading.get_text(" ", strip=True)
if normalized_term in normalize_search_text(title):
if self._term_matches_normalized(normalized_term, normalize_search_text(title)):
return title
text = soup.get_text(" ", strip=True)
@@ -441,6 +642,30 @@ class SourceScanner:
def _find_nearest_link(self, soup: BeautifulSoup, term: str, base_url: str) -> str | None:
normalized_term = normalize_search_text(term)
for link in soup.find_all("a", href=True):
if normalized_term in normalize_search_text(link.get_text(" ", strip=True)):
if self._term_matches_normalized(
normalized_term,
normalize_search_text(link.get_text(" ", strip=True)),
):
return urljoin(base_url, link["href"])
return None
def _term_matches_normalized(self, normalized_term: str, normalized_text: str) -> bool:
if not normalized_term or not normalized_text:
return False
if normalized_term in normalized_text:
return True
if len(normalized_term) < 8:
return False
term_tokens = normalized_term.split()
text_tokens = normalized_text.split()
if not term_tokens or len(text_tokens) < len(term_tokens):
return False
window_size = len(term_tokens)
for index in range(len(text_tokens) - window_size + 1):
candidate = " ".join(text_tokens[index : index + window_size])
if SequenceMatcher(None, normalized_term, candidate).ratio() >= 0.9:
return True
return False
+1 -1
View File
@@ -3,7 +3,7 @@ server {
server_name eventlens.example.com;
location / {
proxy_pass http://127.0.0.1:8000;
proxy_pass http://127.0.0.1:8001;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;