Miss Allie Webseite

2026-05-02 17:04:07 +02:00
parent eaa5ff108c
commit ddab27799b
2 changed files with 119 additions and 0 deletions
@@ -8,3 +8,4 @@ __pycache__/
 htmlcov/
 .DS_Store
 Thumbs.db
+/backups
@@ -237,6 +237,11 @@ class SourceScanner:

        results: list[dict] = []
        seen_keys: set[str] = set()
+        line_results = self._events_from_text_lines(watch_item, source, soup, base_url)
+        if line_results:
+            self.last_message = f"{len(line_results)} passende Events aus Terminzeilen gefunden."
+            return line_results
+
        contexts_with_date = 0
        past_contexts = 0
        outside_region_contexts = 0
@@ -302,6 +307,119 @@ class SourceScanner:

        return results

+    def _events_from_text_lines(
+        self,
+        watch_item: WatchItem,
+        source: WatchSource,
+        soup: BeautifulSoup,
+        base_url: str | None,
+    ) -> list[dict]:
+        """Build event dicts from single text lines that carry a date.
+
+        Every line returned by ``_extract_text_lines`` is treated as one
+        candidate event. A line is dropped when
+
+        * the watch item is of type ``event`` and its name does not match
+          the line,
+        * no date is found in the line,
+        * the date lies before today's UTC date, or
+        * the region scope is Hamburg and the line does not mention
+          "hamburg".
+
+        Args:
+            watch_item: Item being searched for; supplies the name, watch
+                type and region scope.
+            source: Source being scanned; supplies the id, the label (venue
+                fallback) and the fallback URL.
+            soup: Parsed page whose text lines and anchors are inspected.
+            base_url: URL used to resolve links; ``source.url`` is used when
+                this is falsy.
+
+        Returns:
+            One dict per accepted line, de-duplicated by ``external_id``.
+        """
+        # Function-scope import: only this method needs ``timezone``.
+        from datetime import timezone
+
+        results: list[dict] = []
+        seen_keys: set[str] = set()
+        title = self._find_title(soup, watch_item.name)
+        context_url = base_url or source.url
+
+        # Loop invariants, evaluated once instead of once per line.
+        # ``datetime.utcnow()`` is deprecated since Python 3.12; the aware
+        # equivalent yields the same UTC calendar date.
+        today = datetime.now(timezone.utc).date()
+        must_match_name = watch_item.watch_type == WatchType.event
+        normalized_name = (
+            normalize_search_text(watch_item.name) if must_match_name else ""
+        )
+        hamburg_only = watch_item.region_scope == RegionScope.hamburg
+
+        for line in self._extract_text_lines(soup):
+            # Normalized once; reused for matching, region check and the key.
+            normalized_line = normalize_search_text(line)
+
+            if must_match_name and not self._term_matches_normalized(
+                normalized_name,
+                normalized_line,
+            ):
+                continue
+
+            event_date = self._find_nearest_date(line, watch_item.name)
+            if event_date is None or event_date.date() < today:
+                continue
+            if hamburg_only and "hamburg" not in normalized_line:
+                continue
+
+            key = f"{source.id}:{normalized_line}:{event_date.date().isoformat()}"
+            if key in seen_keys:
+                continue
+            seen_keys.add(key)
+
+            # City, venue and link are only worked out for lines that
+            # survive filtering and de-duplication.
+            results.append(
+                {
+                    "external_id": key,
+                    "title": title,
+                    "matched_term": watch_item.name,
+                    "venue_name": self._find_venue_from_line(line, source.label),
+                    "city": self._find_city_in_line(line, watch_item.region_scope),
+                    "country_code": "DE",
+                    "event_date": event_date,
+                    "ticket_url": self._find_link_for_line(soup, line, context_url) or context_url,
+                    "image_url": None,
+                    "raw_payload": {
+                        "source_url": context_url,
+                        "parser": "html_line",
+                        # Cap the stored context so payloads stay bounded.
+                        "context": line[:1000],
+                    },
+                }
+            )
+
+        return results
+
+    def _extract_text_lines(self, soup: BeautifulSoup) -> list[str]:
+        """Return the page's text lines that are long enough and hold a date.
+
+        The page text is split on newlines, runs of whitespace inside each
+        line are collapsed to single spaces, and a line is kept only when it
+        has at least 8 characters and ``_find_nearest_date`` (called with an
+        empty term) returns a truthy value for it.
+        """
+        collapsed = (
+            " ".join(raw.split())
+            for raw in soup.get_text("\n", strip=True).splitlines()
+        )
+        return [
+            text
+            for text in collapsed
+            if len(text) >= 8 and self._find_nearest_date(text, "")
+        ]
+
+    def _find_city_in_line(self, line: str, region_scope: RegionScope) -> str | None:
+        """Guess the city named in a date line.
+
+        For the Hamburg region scope, a line whose normalized text contains
+        "hamburg" yields ``"Hamburg"`` directly. Otherwise the text that
+        follows the first ``D.M.`` / ``D.M.YY(YY)`` date, up to the first
+        ``/`` or the end of the line, is taken as the city.
+
+        Args:
+            line: Whitespace-collapsed text line to inspect.
+            region_scope: Region scope of the watch item being processed.
+
+        Returns:
+            The city text, or ``None`` when no date-prefixed text is found or
+            the captured text consists only of separator characters.
+        """
+        if region_scope == RegionScope.hamburg and "hamburg" in normalize_search_text(line):
+            return "Hamburg"
+
+        # The pattern contains no letters, so no case-insensitive flag is needed.
+        match = re.search(r"\b\d{1,2}\.\d{1,2}\.(?:\d{2,4})?\s+([^/]+)", line)
+        if match is None:
+            return None
+        # Stripping can leave nothing behind (e.g. "12.05. - / Venue");
+        # report that as "no city" instead of returning an empty name.
+        return match.group(1).strip(" /-") or None
+
+    def _find_venue_from_line(self, line: str, default: str) -> str:
+        """Return the text after the first ``/`` in ``line``, or ``default``.
+
+        ``default`` is returned when the line contains no slash or when only
+        whitespace follows the first slash.
+        """
+        # partition() gives an empty tail when there is no slash, which then
+        # falls through to the default exactly like an empty venue does.
+        _, _, tail = line.partition("/")
+        return tail.strip() or default
+
+    def _find_link_for_line(self, soup: BeautifulSoup, line: str, base_url: str) -> str | None:
+        """Pick the anchor whose text best matches a date line.
+
+        An anchor earns 2 points when its normalized text occurs in the
+        normalized line and another 2 when it occurs in the normalized venue
+        part of the line. The highest score wins; ties go to the longer link
+        text and then to the anchor that appears first in the document.
+
+        Returns:
+            The winning ``href`` resolved against ``base_url``, or ``None``
+            when no anchor earns any points.
+        """
+        line_key = normalize_search_text(line)
+        venue_key = normalize_search_text(self._find_venue_from_line(line, ""))
+
+        best_rank = None
+        best_href = None
+        for anchor in soup.find_all("a", href=True):
+            text_key = normalize_search_text(anchor.get_text(" ", strip=True))
+            if not text_key:
+                # Anchors without usable text can never match.
+                continue
+
+            points = 0
+            if text_key in line_key:
+                points += 2
+            if venue_key and text_key in venue_key:
+                points += 2
+            if not points:
+                continue
+
+            rank = (points, len(text_key))
+            # Strict comparison keeps the earliest anchor when ranks tie.
+            if best_rank is None or rank > best_rank:
+                best_rank = rank
+                best_href = anchor["href"]
+
+        if best_rank is None:
+            return None
+        return urljoin(base_url, best_href)
+
    def _events_from_linked_pages(
        self,
        watch_item: WatchItem,