Dubletten-Logik optimiert

This commit is contained in:
ecki
2026-05-02 17:33:43 +02:00
parent ddab27799b
commit 18462f97e8
4 changed files with 104 additions and 23 deletions
+2 -2
View File
@@ -10,7 +10,7 @@
href="https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;700&family=IBM+Plex+Mono:wght@400;500&display=swap"
rel="stylesheet"
/>
<link rel="stylesheet" href="/static/styles.css" />
<link rel="stylesheet" href="/static/styles.css?v=20260502" />
</head>
<body>
<div class="page-shell">
@@ -156,6 +156,6 @@
</div>
<div id="toast" class="toast" aria-live="polite"></div>
<script src="/static/app.js" defer></script>
<script src="/static/app.js?v=20260502" defer></script>
</body>
</html>
+20 -12
View File
@@ -52,11 +52,7 @@ function formatEventDay(value) {
return "unbekannt";
}
const normalizedValue =
typeof value === "string" && /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?$/.test(value)
? `${value}Z`
: value;
const date = new Date(normalizedValue);
const date = new Date(value);
if (Number.isNaN(date.getTime())) {
return value;
}
@@ -68,20 +64,32 @@ function formatEventDay(value) {
}).format(date);
}
function formatEventTime(value) {
function formatEventTime(value, source = "") {
if (!value) {
return "";
}
const normalizedValue =
typeof value === "string" && /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?$/.test(value)
? `${value}Z`
: value;
const date = new Date(normalizedValue);
if (
source?.startsWith("source:")
&& typeof value === "string"
&& /T(?:00|12):00(?::00(?:\.\d+)?)?(?:Z|[+-]\d{2}:\d{2})?$/.test(value)
) {
return "";
}
const date = new Date(value);
if (Number.isNaN(date.getTime())) {
return "";
}
if (
source?.startsWith("source:")
&& date.getMinutes() === 0
&& (date.getHours() === 0 || date.getHours() === 12)
) {
return "";
}
return new Intl.DateTimeFormat("de-DE", {
hour: "2-digit",
minute: "2-digit",
@@ -362,7 +370,7 @@ function renderEvents() {
<article class="event-card">
<div class="event-date-badge">
<strong>${escapeHtml(formatEventDay(event.event_date))}</strong>
<span>${escapeHtml(formatEventTime(event.event_date) || "Zeit offen")}</span>
<span>${escapeHtml(formatEventTime(event.event_date, event.source) || "Zeit offen")}</span>
</div>
<div class="event-content">
<div class="event-header">
+22 -1
View File
@@ -202,6 +202,20 @@ def titles_match(left: str | None, right: str | None) -> bool:
)
def event_has_placeholder_time(event: TrackedEvent) -> bool:
    """Return True when the event's time of day should not be trusted.

    An event without any ``event_date`` counts as having a placeholder time.
    Otherwise the time is a placeholder only if the event's ``source`` starts
    with ``"source:"`` and the timestamp has minute 0 at hour 0 or hour 12.
    Seconds are not inspected.
    """
    moment = event.event_date
    if moment is None:
        return True

    origin = event.source
    # Only events whose source carries the "source:" prefix are candidates.
    if not origin or not origin.startswith("source:"):
        return False

    return moment.minute == 0 and moment.hour in (0, 12)
def event_has_specific_time(event: TrackedEvent) -> bool:
    """Return True when the event has a date whose time is not a placeholder.

    Events without ``event_date`` never have a specific time; for all others
    the answer is the negation of ``event_has_placeholder_time``.
    """
    if event.event_date is None:
        return False
    return not event_has_placeholder_time(event)
def get_event_date_key(value: datetime | None):
return value.date() if value else None
@@ -220,19 +234,26 @@ def events_are_equivalent(left: TrackedEvent, right: TrackedEvent) -> bool:
title_matches = titles_match(left.title, right.title)
venue_matches = titles_match(left.venue_name, right.venue_name)
same_matched_term = (
normalize_event_text(left.matched_term)
and normalize_event_text(left.matched_term) == normalize_event_text(right.matched_term)
)
one_has_placeholder_time = event_has_placeholder_time(left) != event_has_placeholder_time(right)
return title_matches or venue_matches
return title_matches or venue_matches or (same_matched_term and one_has_placeholder_time)
def is_preferred_event(candidate: TrackedEvent, current: TrackedEvent) -> bool:
candidate_score = (
1 if candidate.is_ticket_purchased else 0,
1 if event_has_specific_time(candidate) else 0,
get_provider_priority(candidate.source),
1 if candidate.ticket_url else 0,
candidate.last_seen_at or datetime.min,
)
current_score = (
1 if current.is_ticket_purchased else 0,
1 if event_has_specific_time(current) else 0,
get_provider_priority(current.source),
1 if current.ticket_url else 0,
current.last_seen_at or datetime.min,
+60 -8
View File
@@ -114,12 +114,21 @@ class SourceScanner:
jsonld_events.extend(self._extract_jsonld_events(payload))
jsonld_results = self._events_from_jsonld(watch_item, source, jsonld_events)
if jsonld_results:
return jsonld_results
jsonld_message = self.last_message
html_results = self._events_from_html_text(watch_item, source, soup, source_url)
if html_results:
return html_results
html_message = self.last_message
combined_results = self._dedupe_results([*jsonld_results, *html_results])
if combined_results:
if jsonld_results and html_results:
self.last_message = (
f"{len(combined_results)} passende Events aus strukturierten Daten "
"und Terminzeilen gefunden."
)
elif jsonld_results:
self.last_message = jsonld_message
else:
self.last_message = html_message
return combined_results
if follow_links:
linked_results = self._events_from_linked_pages(watch_item, source, soup, source_url)
@@ -128,6 +137,30 @@ class SourceScanner:
return []
def _dedupe_results(self, results: list[dict]) -> list[dict]:
    """Drop repeated scan results, keeping the first occurrence of each.

    Two results are considered the same when they agree on all four parts of
    the key: the event day, the normalized ``matched_term`` (falling back to
    ``title``), the normalized ``venue_name`` and the normalized ``city``.
    The relative order of the surviving results is unchanged.
    """
    # Dicts preserve insertion order, so the first result stored for a key
    # stays in place and later duplicates are ignored by ``setdefault``.
    first_by_key: dict[tuple[str, str, str, str], dict] = {}

    for entry in results:
        when = entry.get("event_date")
        # Only the calendar day matters for datetimes; anything else is
        # compared by its string form (empty string when missing).
        day = when.date().isoformat() if isinstance(when, datetime) else str(when or "")
        label = entry.get("matched_term") or entry.get("title") or ""
        fingerprint = (
            day,
            normalize_search_text(label),
            normalize_search_text(entry.get("venue_name") or ""),
            normalize_search_text(entry.get("city") or ""),
        )
        first_by_key.setdefault(fingerprint, entry)

    return list(first_by_key.values())
def _extract_jsonld_events(self, payload) -> list[dict]:
events: list[dict] = []
if isinstance(payload, list):
@@ -369,12 +402,29 @@ class SourceScanner:
def _extract_text_lines(self, soup: BeautifulSoup) -> list[str]:
    """Collect unique, whitespace-normalized text snippets that carry a date.

    Two passes are made over *soup*: first over the individual lines of the
    page text, then over the text of selected elements (which may join
    content that the first pass saw as separate lines). A snippet is kept
    when it is at least 8 characters long, ``self._find_nearest_date``
    returns a truthy value for it, and the identical snippet has not been
    kept before. Element snippets longer than 500 characters are skipped.
    Results are returned in discovery order.
    """
    collected: list[str] = []
    already_kept: set[str] = set()

    def consider(raw_text: str, max_length: int | None = None) -> None:
        # Collapse every run of whitespace to a single space.
        candidate = " ".join(raw_text.split())
        size = len(candidate)
        if size < 8 or (max_length is not None and size > max_length):
            return
        # The date check runs before the duplicate check, as in both passes
        # of the straightforward loop formulation.
        if not self._find_nearest_date(candidate, ""):
            return
        if candidate not in already_kept:
            already_kept.add(candidate)
            collected.append(candidate)

    # Pass 1: line-by-line page text.
    for raw_line in soup.get_text("\n", strip=True).splitlines():
        consider(raw_line)

    # Pass 2: text of whole elements, capped in length.
    element_tags = ["article", "li", "p", "tr", "section", "div", "span", "h1", "h2", "h3", "h4"]
    for element in soup.find_all(element_tags):
        consider(element.get_text(" ", strip=True), max_length=500)

    return collected
@@ -383,7 +433,7 @@ class SourceScanner:
return "Hamburg"
match = re.search(
r"\b\d{1,2}\.\d{1,2}\.(?:\d{2,4})?\s+([^/]+)",
r"\b\d{1,2}\.\d{1,2}\.(?:\d{2,4}\.?)?\s+([^/]+)",
line,
re.IGNORECASE,
)
@@ -552,8 +602,8 @@ class SourceScanner:
inferred_candidates: list[datetime] = []
for pattern in (
r"\b(\d{1,2}\.\d{1,2}\.\d{4})\b",
r"\b(\d{1,2}\.\d{1,2}\.\d{2})(?!\d)",
r"\b(\d{1,2}\.\d{1,2}\.\d{4})(?=\D|$)",
r"\b(\d{1,2}\.\d{1,2}\.\d{2}\.?)(?=\D|$)",
):
for match in re.finditer(pattern, search_area):
parsed = self._parse_german_date(match.group(1))
@@ -611,6 +661,7 @@ class SourceScanner:
def _parse_german_date(self, value: str) -> datetime | None:
cleaned = value.strip()
cleaned = cleaned.rstrip(".") if re.fullmatch(r"\d{1,2}\.\d{1,2}\.\d{2}\.", cleaned) else cleaned
current_year = datetime.utcnow().year
candidates = [cleaned]
if re.fullmatch(r"\d{1,2}\.\d{1,2}\.", cleaned):
@@ -623,6 +674,7 @@ class SourceScanner:
for candidate in candidates:
try:
parsed = datetime.strptime(candidate, "%d.%m.%Y")
parsed = parsed.replace(hour=12)
if parsed.date() < datetime.utcnow().date() and candidate != cleaned:
continue
return parsed