Miss Allie Webseite

This commit is contained in:
ecki
2026-05-02 17:04:07 +02:00
parent eaa5ff108c
commit ddab27799b
2 changed files with 119 additions and 0 deletions
+1
View File
@@ -8,3 +8,4 @@ __pycache__/
htmlcov/
.DS_Store
Thumbs.db
/backups
+118
View File
@@ -237,6 +237,11 @@ class SourceScanner:
results: list[dict] = []
seen_keys: set[str] = set()
line_results = self._events_from_text_lines(watch_item, source, soup, base_url)
if line_results:
self.last_message = f"{len(line_results)} passende Events aus Terminzeilen gefunden."
return line_results
contexts_with_date = 0
past_contexts = 0
outside_region_contexts = 0
@@ -302,6 +307,119 @@ class SourceScanner:
return results
def _events_from_text_lines(
    self,
    watch_item: WatchItem,
    source: WatchSource,
    soup: BeautifulSoup,
    base_url: str | None,
) -> list[dict]:
    """Build event records from individual date-bearing text lines of a page.

    Each line returned by ``_extract_text_lines`` is treated as one
    potential event. A line is dropped when:

    * the watch item is of type ``WatchType.event`` and its name does not
      match the line,
    * no date can be found in the line,
    * the date lies before today's UTC date, or
    * the watch item is scoped to ``RegionScope.hamburg`` and the
      normalized line does not contain "hamburg".

    Args:
        watch_item: The watched item; its name, watch type and region
            scope drive the filtering.
        source: The scanned source; supplies ``id``, ``label`` and the
            fallback ``url``.
        soup: Parsed HTML document to scan.
        base_url: URL the document was loaded from; falls back to
            ``source.url`` when empty or ``None``.

    Returns:
        One dict per distinct (source, line, date) combination, in
        document order.
    """
    # Local import: only ``datetime`` is known to be in module scope here.
    from datetime import timezone

    results: list[dict] = []
    seen_keys: set[str] = set()
    title = self._find_title(soup, watch_item.name)
    context_url = base_url or source.url

    # Loop invariants, computed once instead of once per line.
    # ``datetime.now(timezone.utc)`` yields the same calendar date as the
    # deprecated ``datetime.utcnow()``.
    today = datetime.now(timezone.utc).date()
    must_match_name = watch_item.watch_type == WatchType.event
    normalized_name = (
        normalize_search_text(watch_item.name) if must_match_name else ""
    )
    hamburg_only = watch_item.region_scope == RegionScope.hamburg

    for line in self._extract_text_lines(soup):
        # Normalize once; reused for name match, region filter and key.
        normalized_line = normalize_search_text(line)

        if must_match_name and not self._term_matches_normalized(
            normalized_name, normalized_line
        ):
            continue

        event_date = self._find_nearest_date(line, watch_item.name)
        if event_date is None:
            continue
        event_day = event_date.date()
        if event_day < today:
            continue

        if hamburg_only and "hamburg" not in normalized_line:
            continue

        city = self._find_city_in_line(line, watch_item.region_scope)
        venue_name = self._find_venue_from_line(line, source.label)

        # The same line text with the same date from the same source is
        # reported only once.
        key = f"{source.id}:{normalized_line}:{event_day.isoformat()}"
        if key in seen_keys:
            continue
        seen_keys.add(key)

        results.append(
            {
                "external_id": key,
                "title": title,
                "matched_term": watch_item.name,
                "venue_name": venue_name,
                "city": city,
                "country_code": "DE",
                "event_date": event_date,
                # Prefer a link whose text appears in the line; otherwise
                # point at the page itself.
                "ticket_url": self._find_link_for_line(soup, line, context_url)
                or context_url,
                "image_url": None,
                "raw_payload": {
                    "source_url": context_url,
                    "parser": "html_line",
                    "context": line[:1000],
                },
            }
        )
    return results
def _extract_text_lines(self, soup: BeautifulSoup) -> list[str]:
    """Return the whitespace-collapsed text lines of *soup* that contain a date.

    The document text is split into lines. Runs of whitespace inside each
    line are collapsed to single spaces. Lines shorter than 8 characters
    after collapsing, and lines for which ``_find_nearest_date`` returns a
    falsy result, are left out. Document order is preserved.
    """
    collapsed_lines = (
        " ".join(raw.split())
        for raw in soup.get_text("\n", strip=True).splitlines()
    )
    return [
        text
        for text in collapsed_lines
        if len(text) >= 8 and self._find_nearest_date(text, "")
    ]
def _find_city_in_line(self, line: str, region_scope: RegionScope) -> str | None:
    """Guess the city named in an event line.

    For Hamburg-scoped watches, a line whose normalized text contains
    "hamburg" yields ``"Hamburg"`` directly. Otherwise the text between a
    ``D.M.`` / ``D.M.YYYY`` style date and the first ``/`` (or the end of
    the line when there is no slash) is taken as the city.

    Args:
        line: A single, whitespace-collapsed text line.
        region_scope: Region scope of the watch item.

    Returns:
        The city text with surrounding spaces, slashes and hyphens removed,
        or ``None`` when no date-prefixed text is found or nothing but
        separators follows the date.
    """
    if region_scope == RegionScope.hamburg and "hamburg" in normalize_search_text(line):
        return "Hamburg"
    # The pattern has no letters, so no case-insensitivity flag is needed.
    match = re.search(r"\b\d{1,2}\.\d{1,2}\.(?:\d{2,4})?\s+([^/]+)", line)
    if match is None:
        return None
    # A capture such as "- " strips down to an empty string; report that as
    # "no city" rather than returning "".
    return match.group(1).strip(" /-") or None
def _find_venue_from_line(self, line: str, default: str) -> str:
    """Return the venue part of an event line, or *default* if there is none.

    The venue is everything after the first ``/`` in *line*, with
    surrounding whitespace removed. *default* is returned when the line has
    no slash or when nothing but whitespace follows it.
    """
    _, slash, remainder = line.partition("/")
    if not slash:
        return default
    stripped = remainder.strip()
    return stripped if stripped else default
def _find_link_for_line(self, soup: BeautifulSoup, line: str, base_url: str) -> str | None:
    """Pick the anchor in *soup* whose text best matches an event line.

    An anchor with non-empty normalized text scores 2 points when that text
    is contained in the normalized line, and another 2 points when it is
    contained in the normalized venue part of the line. Anchors scoring
    zero are ignored. The highest score wins; ties go to the longer link
    text and then to the anchor that appears first in the document.

    Returns:
        The winning ``href`` resolved against *base_url*, or ``None`` when
        no anchor scores.
    """
    line_norm = normalize_search_text(line)
    venue_norm = normalize_search_text(self._find_venue_from_line(line, ""))

    best_rank = None
    best_href = None
    for anchor in soup.find_all("a", href=True):
        text_norm = normalize_search_text(anchor.get_text(" ", strip=True))
        if not text_norm:
            continue
        in_line = text_norm in line_norm
        in_venue = bool(venue_norm) and text_norm in venue_norm
        if not (in_line or in_venue):
            continue
        rank = (2 * in_line + 2 * in_venue, len(text_norm))
        # Strict ">" keeps the earliest anchor among equally ranked ones.
        if best_rank is None or rank > best_rank:
            best_rank = rank
            best_href = anchor["href"]

    if best_rank is None:
        return None
    return urljoin(base_url, best_href)
def _events_from_linked_pages(
self,
watch_item: WatchItem,