Mojibake bereinigt

This commit is contained in:
ecki
2026-05-02 17:39:24 +02:00
parent 18462f97e8
commit 27158071f9
+41 -24
View File
@@ -55,6 +55,23 @@ FOLLOW_LINK_KEYWORDS = (
MAX_FOLLOWED_LINKS = 6
MOJIBAKE_MARKERS = ("Â", "Ã", "â", *[chr(code) for code in range(0x80, 0xA0)])
def clean_display_text(value: str | None) -> str:
if not value:
return ""
cleaned = " ".join(str(value).split())
if not any(marker in cleaned for marker in MOJIBAKE_MARKERS):
return cleaned
try:
repaired = cleaned.encode("latin1").decode("utf-8")
except UnicodeError:
return cleaned
return " ".join(repaired.split())
class SourceScanner:
headers = {
@@ -199,7 +216,7 @@ class SourceScanner:
past_count = 0
for event in events:
title = event.get("name") or ""
title = clean_display_text(event.get("name"))
performers = self._extract_performer_names(event)
haystack = normalize_search_text(" ".join([title] + performers))
if not self._term_matches_normalized(normalized_term, haystack):
@@ -208,7 +225,7 @@ class SourceScanner:
location = event.get("location") or {}
address = location.get("address") or {}
city = address.get("addressLocality") or location.get("name")
city = clean_display_text(address.get("addressLocality") or location.get("name"))
if watch_item.region_scope == RegionScope.hamburg and normalize_search_text(city) != "hamburg":
outside_region_count += 1
continue
@@ -224,7 +241,7 @@ class SourceScanner:
"external_id": str(event.get("@id") or ticket_url or f"{source.id}:{title}"),
"title": title or watch_item.name,
"matched_term": watch_item.name,
"venue_name": location.get("name") or source.label,
"venue_name": clean_display_text(location.get("name") or source.label),
"city": city,
"country_code": "DE",
"event_date": event_date,
@@ -279,7 +296,7 @@ class SourceScanner:
past_contexts = 0
outside_region_contexts = 0
for context in self._find_matching_contexts(soup, watch_item):
context_text = context.get_text(" ", strip=True)
context_text = clean_display_text(context.get_text(" ", strip=True))
event_date = self._find_nearest_date(context_text, watch_item.name)
if event_date is None:
continue
@@ -307,7 +324,7 @@ class SourceScanner:
"external_id": key,
"title": title,
"matched_term": watch_item.name,
"venue_name": self._find_venue(context_text, source.label),
"venue_name": clean_display_text(self._find_venue(context_text, source.label)),
"city": "Hamburg" if watch_item.region_scope == RegionScope.hamburg else None,
"country_code": "DE",
"event_date": event_date,
@@ -316,7 +333,7 @@ class SourceScanner:
"raw_payload": {
"source_url": context_url,
"parser": "html_text",
"context": context_text[:1000],
"context": clean_display_text(context_text[:1000]),
},
}
)
@@ -372,8 +389,8 @@ class SourceScanner:
):
continue
city = self._find_city_in_line(line, watch_item.region_scope)
venue_name = self._find_venue_from_line(line, source.label)
city = clean_display_text(self._find_city_in_line(line, watch_item.region_scope))
venue_name = clean_display_text(self._find_venue_from_line(line, source.label))
key = f"{source.id}:{normalize_search_text(line)}:{event_date.date().isoformat()}"
if key in seen_keys:
continue
@@ -393,7 +410,7 @@ class SourceScanner:
"raw_payload": {
"source_url": context_url,
"parser": "html_line",
"context": line[:1000],
"context": clean_display_text(line[:1000]),
},
}
)
@@ -404,7 +421,7 @@ class SourceScanner:
lines = []
seen_lines: set[str] = set()
for line in soup.get_text("\n", strip=True).splitlines():
cleaned = " ".join(line.split())
cleaned = clean_display_text(line)
if len(cleaned) < 8:
continue
if not self._find_nearest_date(cleaned, ""):
@@ -417,7 +434,7 @@ class SourceScanner:
for node in soup.find_all(
["article", "li", "p", "tr", "section", "div", "span", "h1", "h2", "h3", "h4"]
):
cleaned = " ".join(node.get_text(" ", strip=True).split())
cleaned = clean_display_text(node.get_text(" ", strip=True))
if len(cleaned) < 8 or len(cleaned) > 500:
continue
if not self._find_nearest_date(cleaned, ""):
@@ -438,14 +455,14 @@ class SourceScanner:
re.IGNORECASE,
)
if match:
return match.group(1).strip(" /-")
return clean_display_text(match.group(1).strip(" /-"))
return None
def _find_venue_from_line(self, line: str, default: str) -> str:
    """Return the venue named after the first "/" in *line*, cleaned for display.

    Args:
        line: A single text line; everything after its first "/" is taken as
            the venue name.
        default: Value to use when *line* has no "/" or nothing usable
            follows it.

    Returns:
        The venue text passed through ``clean_display_text``, or the cleaned
        *default* when no venue text is found.
    """
    if "/" not in line:
        return clean_display_text(default)
    venue = clean_display_text(line.split("/", 1)[1].strip())
    # An empty remainder (e.g. a trailing "/") falls back to the default.
    return venue or clean_display_text(default)
def _find_link_for_line(self, soup: BeautifulSoup, line: str, base_url: str) -> str | None:
normalized_line = normalize_search_text(line)
@@ -559,9 +576,9 @@ class SourceScanner:
def _extract_performer_names(self, event: dict) -> list[str]:
    """Collect the display-cleaned performer names attached to *event*.

    Reads ``event["performer"]``, falling back to ``event["performers"]``.
    NOTE(review): presumably *event* is a schema.org-style event dict; verify
    against the caller.

    Returns:
        A one-element list when the value is a dict, one entry per dict item
        when it is a list (non-dict items are skipped), and an empty list
        otherwise. A missing ``"name"`` yields an empty string entry.
    """
    performer = event.get("performer") or event.get("performers")
    if isinstance(performer, dict):
        return [clean_display_text(performer.get("name"))]
    if isinstance(performer, list):
        return [
            clean_display_text(item.get("name"))
            for item in performer
            if isinstance(item, dict)
        ]
    return []
def _extract_image(self, event: dict) -> str | None:
@@ -744,8 +761,8 @@ class SourceScanner:
for line in lines:
normalized = normalize_search_text(line)
if "hamburg" in normalized and len(line) <= 120:
return line
return default
return clean_display_text(line)
return clean_display_text(default)
def _find_best_context(self, soup: BeautifulSoup, term: str):
normalized_term = normalize_search_text(term)
@@ -789,11 +806,11 @@ class SourceScanner:
return term
normalized_term = normalize_search_text(term)
for heading in soup.find_all(["h1", "h2", "h3", "h4", "strong", "b", "a"]):
title = heading.get_text(" ", strip=True)
title = clean_display_text(heading.get_text(" ", strip=True))
if self._term_matches_normalized(normalized_term, normalize_search_text(title)):
return title
text = soup.get_text(" ", strip=True)
text = clean_display_text(soup.get_text(" ", strip=True))
dated_match = re.search(
r"(.{0,40}\d{1,2}\.\d{1,2}\.(?:\d{2,4})?.{0,100}"
+ re.escape(term)
@@ -802,12 +819,12 @@ class SourceScanner:
re.IGNORECASE,
)
if dated_match:
return " ".join(dated_match.group(1).split())
return clean_display_text(dated_match.group(1))
match = re.search(r"(.{0,80}" + re.escape(term) + r".{0,80})", text, re.IGNORECASE)
if match:
return " ".join(match.group(1).split())
return term
return clean_display_text(match.group(1))
return clean_display_text(term)
def _find_nearest_link(self, soup: BeautifulSoup, term: str, base_url: str) -> str | None:
normalized_term = normalize_search_text(term)