Compare commits
2 Commits
eaa5ff108c
...
18462f97e8
| Author | SHA1 | Date | |
|---|---|---|---|
| 18462f97e8 | |||
| ddab27799b |
@@ -8,3 +8,4 @@ __pycache__/
|
||||
htmlcov/
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
/backups
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
href="https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;700&family=IBM+Plex+Mono:wght@400;500&display=swap"
|
||||
rel="stylesheet"
|
||||
/>
|
||||
<link rel="stylesheet" href="/static/styles.css" />
|
||||
<link rel="stylesheet" href="/static/styles.css?v=20260502" />
|
||||
</head>
|
||||
<body>
|
||||
<div class="page-shell">
|
||||
@@ -156,6 +156,6 @@
|
||||
</div>
|
||||
|
||||
<div id="toast" class="toast" aria-live="polite"></div>
|
||||
<script src="/static/app.js" defer></script>
|
||||
<script src="/static/app.js?v=20260502" defer></script>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@@ -52,11 +52,7 @@ function formatEventDay(value) {
|
||||
return "unbekannt";
|
||||
}
|
||||
|
||||
const normalizedValue =
|
||||
typeof value === "string" && /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?$/.test(value)
|
||||
? `${value}Z`
|
||||
: value;
|
||||
const date = new Date(normalizedValue);
|
||||
const date = new Date(value);
|
||||
if (Number.isNaN(date.getTime())) {
|
||||
return value;
|
||||
}
|
||||
@@ -68,20 +64,32 @@ function formatEventDay(value) {
|
||||
}).format(date);
|
||||
}
|
||||
|
||||
function formatEventTime(value) {
|
||||
function formatEventTime(value, source = "") {
|
||||
if (!value) {
|
||||
return "";
|
||||
}
|
||||
|
||||
const normalizedValue =
|
||||
typeof value === "string" && /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?$/.test(value)
|
||||
? `${value}Z`
|
||||
: value;
|
||||
const date = new Date(normalizedValue);
|
||||
if (
|
||||
source?.startsWith("source:")
|
||||
&& typeof value === "string"
|
||||
&& /T(?:00|12):00(?::00(?:\.\d+)?)?(?:Z|[+-]\d{2}:\d{2})?$/.test(value)
|
||||
) {
|
||||
return "";
|
||||
}
|
||||
|
||||
const date = new Date(value);
|
||||
if (Number.isNaN(date.getTime())) {
|
||||
return "";
|
||||
}
|
||||
|
||||
if (
|
||||
source?.startsWith("source:")
|
||||
&& date.getMinutes() === 0
|
||||
&& (date.getHours() === 0 || date.getHours() === 12)
|
||||
) {
|
||||
return "";
|
||||
}
|
||||
|
||||
return new Intl.DateTimeFormat("de-DE", {
|
||||
hour: "2-digit",
|
||||
minute: "2-digit",
|
||||
@@ -362,7 +370,7 @@ function renderEvents() {
|
||||
<article class="event-card">
|
||||
<div class="event-date-badge">
|
||||
<strong>${escapeHtml(formatEventDay(event.event_date))}</strong>
|
||||
<span>${escapeHtml(formatEventTime(event.event_date) || "Zeit offen")}</span>
|
||||
<span>${escapeHtml(formatEventTime(event.event_date, event.source) || "Zeit offen")}</span>
|
||||
</div>
|
||||
<div class="event-content">
|
||||
<div class="event-header">
|
||||
|
||||
+22
-1
@@ -202,6 +202,20 @@ def titles_match(left: str | None, right: str | None) -> bool:
|
||||
)
|
||||
|
||||
|
||||
def event_has_placeholder_time(event: TrackedEvent) -> bool:
    """Report whether the event's time of day is missing or only a stand-in.

    An event without any ``event_date`` always counts as a placeholder.
    Otherwise the time is a placeholder only when the event's ``source``
    starts with ``"source:"`` and the timestamp has minute 0 at hour 0 or
    hour 12.
    """
    when = event.event_date
    if when is None:
        return True

    origin = event.source
    # Only events whose source is tagged "source:" can carry a stand-in time.
    if not origin or not origin.startswith("source:"):
        return False
    if when.minute != 0:
        return False
    return when.hour in (0, 12)
|
||||
|
||||
|
||||
def event_has_specific_time(event: TrackedEvent) -> bool:
    """Report whether the event carries a real, non-placeholder time of day.

    True only when ``event_date`` is set and ``event_has_placeholder_time``
    does not classify it as a placeholder.
    """
    if event.event_date is None:
        return False
    return not event_has_placeholder_time(event)
|
||||
|
||||
|
||||
def get_event_date_key(value: datetime | None):
|
||||
return value.date() if value else None
|
||||
|
||||
@@ -220,19 +234,26 @@ def events_are_equivalent(left: TrackedEvent, right: TrackedEvent) -> bool:
|
||||
|
||||
title_matches = titles_match(left.title, right.title)
|
||||
venue_matches = titles_match(left.venue_name, right.venue_name)
|
||||
same_matched_term = (
|
||||
normalize_event_text(left.matched_term)
|
||||
and normalize_event_text(left.matched_term) == normalize_event_text(right.matched_term)
|
||||
)
|
||||
one_has_placeholder_time = event_has_placeholder_time(left) != event_has_placeholder_time(right)
|
||||
|
||||
return title_matches or venue_matches
|
||||
return title_matches or venue_matches or (same_matched_term and one_has_placeholder_time)
|
||||
|
||||
|
||||
def is_preferred_event(candidate: TrackedEvent, current: TrackedEvent) -> bool:
|
||||
candidate_score = (
|
||||
1 if candidate.is_ticket_purchased else 0,
|
||||
1 if event_has_specific_time(candidate) else 0,
|
||||
get_provider_priority(candidate.source),
|
||||
1 if candidate.ticket_url else 0,
|
||||
candidate.last_seen_at or datetime.min,
|
||||
)
|
||||
current_score = (
|
||||
1 if current.is_ticket_purchased else 0,
|
||||
1 if event_has_specific_time(current) else 0,
|
||||
get_provider_priority(current.source),
|
||||
1 if current.ticket_url else 0,
|
||||
current.last_seen_at or datetime.min,
|
||||
|
||||
@@ -114,12 +114,21 @@ class SourceScanner:
|
||||
jsonld_events.extend(self._extract_jsonld_events(payload))
|
||||
|
||||
jsonld_results = self._events_from_jsonld(watch_item, source, jsonld_events)
|
||||
if jsonld_results:
|
||||
return jsonld_results
|
||||
|
||||
jsonld_message = self.last_message
|
||||
html_results = self._events_from_html_text(watch_item, source, soup, source_url)
|
||||
if html_results:
|
||||
return html_results
|
||||
html_message = self.last_message
|
||||
combined_results = self._dedupe_results([*jsonld_results, *html_results])
|
||||
if combined_results:
|
||||
if jsonld_results and html_results:
|
||||
self.last_message = (
|
||||
f"{len(combined_results)} passende Events aus strukturierten Daten "
|
||||
"und Terminzeilen gefunden."
|
||||
)
|
||||
elif jsonld_results:
|
||||
self.last_message = jsonld_message
|
||||
else:
|
||||
self.last_message = html_message
|
||||
return combined_results
|
||||
|
||||
if follow_links:
|
||||
linked_results = self._events_from_linked_pages(watch_item, source, soup, source_url)
|
||||
@@ -128,6 +137,30 @@ class SourceScanner:
|
||||
|
||||
return []
|
||||
|
||||
def _dedupe_results(self, results: list[dict]) -> list[dict]:
    """Drop repeated events from ``results``, keeping the first occurrence.

    Two results are duplicates when they share the same day, the same
    normalized matched term (falling back to the title), the same
    normalized venue name and the same normalized city. Input order is
    preserved.
    """

    def identity(entry: dict) -> tuple[str, str, str, str]:
        when = entry.get("event_date")
        # Only the day is compared, so entries that differ just in their
        # time of day collapse into one.
        if isinstance(when, datetime):
            day = when.date().isoformat()
        else:
            day = str(when or "")
        label = entry.get("matched_term") or entry.get("title") or ""
        return (
            day,
            normalize_search_text(label),
            normalize_search_text(entry.get("venue_name") or ""),
            normalize_search_text(entry.get("city") or ""),
        )

    unique: list[dict] = []
    known: set[tuple[str, str, str, str]] = set()
    for entry in results:
        fingerprint = identity(entry)
        if fingerprint not in known:
            known.add(fingerprint)
            unique.append(entry)
    return unique
|
||||
|
||||
def _extract_jsonld_events(self, payload) -> list[dict]:
|
||||
events: list[dict] = []
|
||||
if isinstance(payload, list):
|
||||
@@ -237,6 +270,11 @@ class SourceScanner:
|
||||
|
||||
results: list[dict] = []
|
||||
seen_keys: set[str] = set()
|
||||
line_results = self._events_from_text_lines(watch_item, source, soup, base_url)
|
||||
if line_results:
|
||||
self.last_message = f"{len(line_results)} passende Events aus Terminzeilen gefunden."
|
||||
return line_results
|
||||
|
||||
contexts_with_date = 0
|
||||
past_contexts = 0
|
||||
outside_region_contexts = 0
|
||||
@@ -302,6 +340,136 @@ class SourceScanner:
|
||||
|
||||
return results
|
||||
|
||||
def _events_from_text_lines(
    self,
    watch_item: WatchItem,
    source: WatchSource,
    soup: BeautifulSoup,
    base_url: str | None,
) -> list[dict]:
    """Build event results from individual dated text lines of the page.

    Every line returned by ``_extract_text_lines`` is examined on its own.
    A line becomes a result when all of the following hold:

    * for watch items of type ``WatchType.event``, the normalized line
      matches the normalized watch item name;
    * ``_find_nearest_date`` finds a date for the line;
    * that date is not before today's UTC date;
    * for ``RegionScope.hamburg``, the normalized line contains "hamburg".

    Args:
        watch_item: Watch item whose name, type and region scope drive the
            filtering.
        source: Source being scanned; supplies the id, label and the
            fallback URL.
        soup: Parsed page to read lines and links from.
        base_url: URL used as link base and context; ``source.url`` is used
            when it is empty.

    Returns:
        One result dict per distinct (normalized line, day) pair, in the
        order the lines were extracted.
    """
    results: list[dict] = []
    seen_keys: set[str] = set()
    title = self._find_title(soup, watch_item.name)
    context_url = base_url or source.url

    # Loop-invariant values are computed once instead of once per line.
    must_match_name = watch_item.watch_type == WatchType.event
    normalized_name = normalize_search_text(watch_item.name) if must_match_name else ""
    hamburg_only = watch_item.region_scope == RegionScope.hamburg
    today = datetime.utcnow().date()

    for line in self._extract_text_lines(soup):
        # Normalize each line a single time; the result feeds the name
        # match, the region check and the dedupe key.
        normalized_line = normalize_search_text(line)

        if must_match_name and not self._term_matches_normalized(
            normalized_name,
            normalized_line,
        ):
            continue

        event_date = self._find_nearest_date(line, watch_item.name)
        if event_date is None:
            continue
        if event_date.date() < today:
            continue
        if hamburg_only and "hamburg" not in normalized_line:
            continue

        city = self._find_city_in_line(line, watch_item.region_scope)
        venue_name = self._find_venue_from_line(line, source.label)
        key = f"{source.id}:{normalized_line}:{event_date.date().isoformat()}"
        if key in seen_keys:
            continue
        seen_keys.add(key)

        results.append(
            {
                "external_id": key,
                "title": title,
                "matched_term": watch_item.name,
                "venue_name": venue_name,
                "city": city,
                "country_code": "DE",
                "event_date": event_date,
                # Fall back to the page itself when no link fits the line.
                "ticket_url": self._find_link_for_line(soup, line, context_url) or context_url,
                "image_url": None,
                "raw_payload": {
                    "source_url": context_url,
                    "parser": "html_line",
                    "context": line[:1000],
                },
            }
        )

    return results
|
||||
|
||||
def _extract_text_lines(self, soup: BeautifulSoup) -> list[str]:
    """Collect distinct, whitespace-collapsed text snippets that contain a date.

    Candidates come from two passes, in this order: first every line of the
    page's text, then the combined text of each article, li, p, tr, section,
    div, span and h1-h4 element. Snippets shorter than 8 characters are
    ignored; element texts longer than 500 characters are ignored as well.
    A snippet is kept only if ``_find_nearest_date`` finds a date in it and
    it has not been collected before.
    """
    collected: list[str] = []
    already_seen: set[str] = set()

    def keep_if_dated(snippet: str) -> None:
        # The date lookup runs before the duplicate check, for both passes.
        if not self._find_nearest_date(snippet, ""):
            return
        if snippet in already_seen:
            return
        already_seen.add(snippet)
        collected.append(snippet)

    # Pass 1: plain text, one candidate per rendered line.
    for raw_line in soup.get_text("\n", strip=True).splitlines():
        snippet = " ".join(raw_line.split())
        if len(snippet) >= 8:
            keep_if_dated(snippet)

    # Pass 2: whole elements, so text split across child tags is seen as one line.
    element_names = ["article", "li", "p", "tr", "section", "div", "span", "h1", "h2", "h3", "h4"]
    for element in soup.find_all(element_names):
        snippet = " ".join(element.get_text(" ", strip=True).split())
        if 8 <= len(snippet) <= 500:
            keep_if_dated(snippet)

    return collected
|
||||
|
||||
def _find_city_in_line(self, line: str, region_scope: RegionScope) -> str | None:
    """Guess the city named in a dated text line.

    For the Hamburg region scope, a line whose normalized text contains
    "hamburg" yields "Hamburg" directly. Otherwise the text that follows a
    ``D.M.`` / ``D.M.YY`` / ``D.M.YYYY`` date, up to the next "/" or the end
    of the line, is taken as the city.

    Args:
        line: Text line to inspect.
        region_scope: Region scope of the watch item being scanned.

    Returns:
        The city with surrounding spaces, slashes and dashes removed, or
        ``None`` when no date is found or nothing is left after trimming.
    """
    if region_scope == RegionScope.hamburg and "hamburg" in normalize_search_text(line):
        return "Hamburg"

    match = re.search(
        r"\b\d{1,2}\.\d{1,2}\.(?:\d{2,4}\.?)?\s+([^/]+)",
        line,
        re.IGNORECASE,
    )
    if match is None:
        return None
    # A segment made up only of separators (e.g. "12.05. - / Venue") strips
    # down to an empty string; report that as "no city" rather than "".
    return match.group(1).strip(" /-") or None
|
||||
|
||||
def _find_venue_from_line(self, line: str, default: str) -> str:
|
||||
if "/" not in line:
|
||||
return default
|
||||
venue = line.split("/", 1)[1].strip()
|
||||
return venue or default
|
||||
|
||||
def _find_link_for_line(self, soup: BeautifulSoup, line: str, base_url: str) -> str | None:
    """Pick the page link that best fits a dated text line.

    Each anchor with an ``href`` and non-empty normalized text earns two
    points when its text occurs in the normalized line and two more when it
    occurs in the line's normalized venue part. Among anchors with a
    non-zero score the highest score wins, then the longest link text, then
    the earliest position on the page.

    Returns:
        The winning ``href`` resolved against ``base_url``, or ``None`` when
        no anchor scores.
    """
    line_text = normalize_search_text(line)
    venue_text = normalize_search_text(self._find_venue_from_line(line, ""))

    best_rank = None
    best_href = None
    for anchor in soup.find_all("a", href=True):
        anchor_text = normalize_search_text(anchor.get_text(" ", strip=True))
        if not anchor_text:
            continue

        points = 0
        if anchor_text in line_text:
            points += 2
        if venue_text and anchor_text in venue_text:
            points += 2
        if not points:
            continue

        rank = (points, len(anchor_text))
        # Strict comparison keeps the earliest anchor when ranks tie.
        if best_rank is None or rank > best_rank:
            best_rank = rank
            best_href = anchor["href"]

    if best_rank is None:
        return None
    return urljoin(base_url, best_href)
|
||||
|
||||
def _events_from_linked_pages(
|
||||
self,
|
||||
watch_item: WatchItem,
|
||||
@@ -434,8 +602,8 @@ class SourceScanner:
|
||||
inferred_candidates: list[datetime] = []
|
||||
|
||||
for pattern in (
|
||||
r"\b(\d{1,2}\.\d{1,2}\.\d{4})\b",
|
||||
r"\b(\d{1,2}\.\d{1,2}\.\d{2})(?!\d)",
|
||||
r"\b(\d{1,2}\.\d{1,2}\.\d{4})(?=\D|$)",
|
||||
r"\b(\d{1,2}\.\d{1,2}\.\d{2}\.?)(?=\D|$)",
|
||||
):
|
||||
for match in re.finditer(pattern, search_area):
|
||||
parsed = self._parse_german_date(match.group(1))
|
||||
@@ -493,6 +661,7 @@ class SourceScanner:
|
||||
|
||||
def _parse_german_date(self, value: str) -> datetime | None:
|
||||
cleaned = value.strip()
|
||||
cleaned = cleaned.rstrip(".") if re.fullmatch(r"\d{1,2}\.\d{1,2}\.\d{2}\.", cleaned) else cleaned
|
||||
current_year = datetime.utcnow().year
|
||||
candidates = [cleaned]
|
||||
if re.fullmatch(r"\d{1,2}\.\d{1,2}\.", cleaned):
|
||||
@@ -505,6 +674,7 @@ class SourceScanner:
|
||||
for candidate in candidates:
|
||||
try:
|
||||
parsed = datetime.strptime(candidate, "%d.%m.%Y")
|
||||
parsed = parsed.replace(hour=12)
|
||||
if parsed.date() < datetime.utcnow().date() and candidate != cleaned:
|
||||
continue
|
||||
return parsed
|
||||
|
||||
Reference in New Issue
Block a user