Miss Allie Webseite
This commit is contained in:
@@ -8,3 +8,4 @@ __pycache__/
|
||||
htmlcov/
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
/backups
|
||||
|
||||
@@ -237,6 +237,11 @@ class SourceScanner:
|
||||
|
||||
results: list[dict] = []
|
||||
seen_keys: set[str] = set()
|
||||
line_results = self._events_from_text_lines(watch_item, source, soup, base_url)
|
||||
if line_results:
|
||||
self.last_message = f"{len(line_results)} passende Events aus Terminzeilen gefunden."
|
||||
return line_results
|
||||
|
||||
contexts_with_date = 0
|
||||
past_contexts = 0
|
||||
outside_region_contexts = 0
|
||||
@@ -302,6 +307,119 @@ class SourceScanner:
|
||||
|
||||
return results
|
||||
|
||||
def _events_from_text_lines(
    self,
    watch_item: WatchItem,
    source: WatchSource,
    soup: BeautifulSoup,
    base_url: str | None,
) -> list[dict]:
    """Build event records from individual date-bearing text lines of a page.

    Every line returned by ``_extract_text_lines`` is treated as one
    candidate event. A line is skipped when any of the following holds:

    * the watch item has type ``WatchType.event`` and its name does not
      match the line (compared on normalized text),
    * ``_find_nearest_date`` finds no date in the line,
    * the date lies before today's UTC calendar date,
    * the watch item is scoped to ``RegionScope.hamburg`` and the normalized
      line does not contain ``"hamburg"``,
    * the same source/line/date combination was already emitted.

    Args:
        watch_item: The item being searched for; its ``name``,
            ``watch_type`` and ``region_scope`` are read.
        source: The scanned source; its ``id``, ``url`` and ``label`` are read.
        soup: Parsed HTML of the page.
        base_url: URL used to resolve links and reported as ``source_url``;
            ``source.url`` is used when it is falsy.

    Returns:
        A list of event dicts (possibly empty), one per accepted line.
    """
    # Function-scope import so the block does not rely on the file header
    # already importing ``timezone``.
    from datetime import timezone

    results: list[dict] = []
    seen_keys: set[str] = set()
    title = self._find_title(soup, watch_item.name)
    context_url = base_url or source.url

    # Loop invariants, computed once instead of once per line.
    must_match_name = watch_item.watch_type == WatchType.event
    normalized_name = (
        normalize_search_text(watch_item.name) if must_match_name else None
    )
    hamburg_only = watch_item.region_scope == RegionScope.hamburg
    # Same UTC calendar date as the deprecated ``datetime.utcnow().date()``.
    today = datetime.now(timezone.utc).date()

    for line in self._extract_text_lines(soup):
        # Normalize once; the value is reused for matching, the region
        # filter and the de-duplication key.
        normalized_line = normalize_search_text(line)

        if must_match_name and not self._term_matches_normalized(
            normalized_name,
            normalized_line,
        ):
            continue

        event_date = self._find_nearest_date(line, watch_item.name)
        if event_date is None:
            continue
        if event_date.date() < today:
            continue
        if hamburg_only and "hamburg" not in normalized_line:
            continue

        city = self._find_city_in_line(line, watch_item.region_scope)
        venue_name = self._find_venue_from_line(line, source.label)

        key = f"{source.id}:{normalized_line}:{event_date.date().isoformat()}"
        if key in seen_keys:
            continue
        seen_keys.add(key)

        results.append(
            {
                "external_id": key,
                "title": title,
                "matched_term": watch_item.name,
                "venue_name": venue_name,
                "city": city,
                "country_code": "DE",
                "event_date": event_date,
                # Fall back to the page itself when no matching link is found.
                "ticket_url": self._find_link_for_line(soup, line, context_url) or context_url,
                "image_url": None,
                "raw_payload": {
                    "source_url": context_url,
                    "parser": "html_line",
                    "context": line[:1000],
                },
            }
        )

    return results
|
||||
|
||||
def _extract_text_lines(self, soup: BeautifulSoup) -> list[str]:
    """Return the page's text lines that are long enough and contain a date.

    The page text is split into lines, runs of whitespace inside each line
    are collapsed to single spaces, and a line is kept only if it has at
    least 8 characters and ``_find_nearest_date`` (called with an empty
    search term) returns a truthy result for it.
    """
    page_text = soup.get_text("\n", strip=True)
    collapsed = (" ".join(raw.split()) for raw in page_text.splitlines())
    return [
        text
        for text in collapsed
        # Length is checked first so short lines never reach the date parser.
        if len(text) >= 8 and self._find_nearest_date(text, "")
    ]
|
||||
|
||||
def _find_city_in_line(self, line: str, region_scope: RegionScope) -> str | None:
    """Guess the city named in an event line.

    For ``RegionScope.hamburg`` the answer is ``"Hamburg"`` as soon as the
    normalized line contains ``"hamburg"``. Otherwise the text following the
    first ``D.M.`` / ``D.M.YY`` / ``D.M.YYYY`` token, up to the next ``/`` (or
    the end of the line), is taken as the city.

    Args:
        line: A single text line from the page.
        region_scope: Region restriction of the watch item.

    Returns:
        The city name, or ``None`` when no date token is found or nothing
        but spaces, slashes and dashes follows it.
    """
    if region_scope == RegionScope.hamburg and "hamburg" in normalize_search_text(line):
        return "Hamburg"

    # The pattern contains no letters, so no case-insensitivity flag is needed.
    match = re.search(r"\b\d{1,2}\.\d{1,2}\.(?:\d{2,4})?\s+([^/]+)", line)
    if match is None:
        return None

    city = match.group(1).strip(" /-")
    # The capture may consist solely of separators (e.g. "12.05. - / Venue");
    # report that as "no city" instead of an empty string.
    return city or None
|
||||
|
||||
def _find_venue_from_line(self, line: str, default: str) -> str:
|
||||
if "/" not in line:
|
||||
return default
|
||||
venue = line.split("/", 1)[1].strip()
|
||||
return venue or default
|
||||
|
||||
def _find_link_for_line(self, soup: BeautifulSoup, line: str, base_url: str) -> str | None:
    """Pick the page link whose text best matches an event line.

    Each ``<a href>`` with non-empty normalized text scores 2 points if that
    text occurs in the normalized line and another 2 if it occurs in the
    normalized venue part of the line. Among links with a non-zero score the
    highest score wins, ties go to the longer link text, and remaining ties
    to the link that appears first in the document.

    Returns:
        The winning ``href`` resolved against ``base_url``, or ``None`` when
        no link scores.
    """
    line_key = normalize_search_text(line)
    venue_key = normalize_search_text(self._find_venue_from_line(line, ""))

    scored = []
    for anchor in soup.find_all("a", href=True):
        anchor_key = normalize_search_text(anchor.get_text(" ", strip=True))
        if not anchor_key:
            continue
        in_line = anchor_key in line_key
        in_venue = bool(venue_key) and anchor_key in venue_key
        points = 2 * (in_line + in_venue)
        if points:
            scored.append((points, len(anchor_key), anchor["href"]))

    if not scored:
        return None

    # max() returns the first maximal entry, so document order breaks ties.
    best = max(scored, key=lambda entry: (entry[0], entry[1]))
    return urljoin(base_url, best[2])
|
||||
|
||||
def _events_from_linked_pages(
|
||||
self,
|
||||
watch_item: WatchItem,
|
||||
|
||||
Reference in New Issue
Block a user