Mojibake bereinigt
This commit is contained in:
@@ -55,6 +55,23 @@ FOLLOW_LINK_KEYWORDS = (
|
||||
|
||||
MAX_FOLLOWED_LINKS = 6
|
||||
|
||||
MOJIBAKE_MARKERS = ("Â", "Ã", "â", *[chr(code) for code in range(0x80, 0xA0)])
|
||||
|
||||
|
||||
def clean_display_text(value: str | None) -> str:
|
||||
if not value:
|
||||
return ""
|
||||
|
||||
cleaned = " ".join(str(value).split())
|
||||
if not any(marker in cleaned for marker in MOJIBAKE_MARKERS):
|
||||
return cleaned
|
||||
|
||||
try:
|
||||
repaired = cleaned.encode("latin1").decode("utf-8")
|
||||
except UnicodeError:
|
||||
return cleaned
|
||||
return " ".join(repaired.split())
|
||||
|
||||
|
||||
class SourceScanner:
|
||||
headers = {
|
||||
@@ -199,7 +216,7 @@ class SourceScanner:
|
||||
past_count = 0
|
||||
|
||||
for event in events:
|
||||
title = event.get("name") or ""
|
||||
title = clean_display_text(event.get("name"))
|
||||
performers = self._extract_performer_names(event)
|
||||
haystack = normalize_search_text(" ".join([title] + performers))
|
||||
if not self._term_matches_normalized(normalized_term, haystack):
|
||||
@@ -208,7 +225,7 @@ class SourceScanner:
|
||||
|
||||
location = event.get("location") or {}
|
||||
address = location.get("address") or {}
|
||||
city = address.get("addressLocality") or location.get("name")
|
||||
city = clean_display_text(address.get("addressLocality") or location.get("name"))
|
||||
if watch_item.region_scope == RegionScope.hamburg and normalize_search_text(city) != "hamburg":
|
||||
outside_region_count += 1
|
||||
continue
|
||||
@@ -224,7 +241,7 @@ class SourceScanner:
|
||||
"external_id": str(event.get("@id") or ticket_url or f"{source.id}:{title}"),
|
||||
"title": title or watch_item.name,
|
||||
"matched_term": watch_item.name,
|
||||
"venue_name": location.get("name") or source.label,
|
||||
"venue_name": clean_display_text(location.get("name") or source.label),
|
||||
"city": city,
|
||||
"country_code": "DE",
|
||||
"event_date": event_date,
|
||||
@@ -279,7 +296,7 @@ class SourceScanner:
|
||||
past_contexts = 0
|
||||
outside_region_contexts = 0
|
||||
for context in self._find_matching_contexts(soup, watch_item):
|
||||
context_text = context.get_text(" ", strip=True)
|
||||
context_text = clean_display_text(context.get_text(" ", strip=True))
|
||||
event_date = self._find_nearest_date(context_text, watch_item.name)
|
||||
if event_date is None:
|
||||
continue
|
||||
@@ -307,7 +324,7 @@ class SourceScanner:
|
||||
"external_id": key,
|
||||
"title": title,
|
||||
"matched_term": watch_item.name,
|
||||
"venue_name": self._find_venue(context_text, source.label),
|
||||
"venue_name": clean_display_text(self._find_venue(context_text, source.label)),
|
||||
"city": "Hamburg" if watch_item.region_scope == RegionScope.hamburg else None,
|
||||
"country_code": "DE",
|
||||
"event_date": event_date,
|
||||
@@ -316,7 +333,7 @@ class SourceScanner:
|
||||
"raw_payload": {
|
||||
"source_url": context_url,
|
||||
"parser": "html_text",
|
||||
"context": context_text[:1000],
|
||||
"context": clean_display_text(context_text[:1000]),
|
||||
},
|
||||
}
|
||||
)
|
||||
@@ -372,8 +389,8 @@ class SourceScanner:
|
||||
):
|
||||
continue
|
||||
|
||||
city = self._find_city_in_line(line, watch_item.region_scope)
|
||||
venue_name = self._find_venue_from_line(line, source.label)
|
||||
city = clean_display_text(self._find_city_in_line(line, watch_item.region_scope))
|
||||
venue_name = clean_display_text(self._find_venue_from_line(line, source.label))
|
||||
key = f"{source.id}:{normalize_search_text(line)}:{event_date.date().isoformat()}"
|
||||
if key in seen_keys:
|
||||
continue
|
||||
@@ -393,7 +410,7 @@ class SourceScanner:
|
||||
"raw_payload": {
|
||||
"source_url": context_url,
|
||||
"parser": "html_line",
|
||||
"context": line[:1000],
|
||||
"context": clean_display_text(line[:1000]),
|
||||
},
|
||||
}
|
||||
)
|
||||
@@ -404,7 +421,7 @@ class SourceScanner:
|
||||
lines = []
|
||||
seen_lines: set[str] = set()
|
||||
for line in soup.get_text("\n", strip=True).splitlines():
|
||||
cleaned = " ".join(line.split())
|
||||
cleaned = clean_display_text(line)
|
||||
if len(cleaned) < 8:
|
||||
continue
|
||||
if not self._find_nearest_date(cleaned, ""):
|
||||
@@ -417,7 +434,7 @@ class SourceScanner:
|
||||
for node in soup.find_all(
|
||||
["article", "li", "p", "tr", "section", "div", "span", "h1", "h2", "h3", "h4"]
|
||||
):
|
||||
cleaned = " ".join(node.get_text(" ", strip=True).split())
|
||||
cleaned = clean_display_text(node.get_text(" ", strip=True))
|
||||
if len(cleaned) < 8 or len(cleaned) > 500:
|
||||
continue
|
||||
if not self._find_nearest_date(cleaned, ""):
|
||||
@@ -438,14 +455,14 @@ class SourceScanner:
|
||||
re.IGNORECASE,
|
||||
)
|
||||
if match:
|
||||
return match.group(1).strip(" /-")
|
||||
return clean_display_text(match.group(1).strip(" /-"))
|
||||
return None
|
||||
|
||||
def _find_venue_from_line(self, line: str, default: str) -> str:
|
||||
if "/" not in line:
|
||||
return default
|
||||
venue = line.split("/", 1)[1].strip()
|
||||
return venue or default
|
||||
return clean_display_text(default)
|
||||
venue = clean_display_text(line.split("/", 1)[1].strip())
|
||||
return venue or clean_display_text(default)
|
||||
|
||||
def _find_link_for_line(self, soup: BeautifulSoup, line: str, base_url: str) -> str | None:
|
||||
normalized_line = normalize_search_text(line)
|
||||
@@ -559,9 +576,9 @@ class SourceScanner:
|
||||
def _extract_performer_names(self, event: dict) -> list[str]:
|
||||
performer = event.get("performer") or event.get("performers")
|
||||
if isinstance(performer, dict):
|
||||
return [performer.get("name", "")]
|
||||
return [clean_display_text(performer.get("name"))]
|
||||
if isinstance(performer, list):
|
||||
return [item.get("name", "") for item in performer if isinstance(item, dict)]
|
||||
return [clean_display_text(item.get("name")) for item in performer if isinstance(item, dict)]
|
||||
return []
|
||||
|
||||
def _extract_image(self, event: dict) -> str | None:
|
||||
@@ -744,8 +761,8 @@ class SourceScanner:
|
||||
for line in lines:
|
||||
normalized = normalize_search_text(line)
|
||||
if "hamburg" in normalized and len(line) <= 120:
|
||||
return line
|
||||
return default
|
||||
return clean_display_text(line)
|
||||
return clean_display_text(default)
|
||||
|
||||
def _find_best_context(self, soup: BeautifulSoup, term: str):
|
||||
normalized_term = normalize_search_text(term)
|
||||
@@ -789,11 +806,11 @@ class SourceScanner:
|
||||
return term
|
||||
normalized_term = normalize_search_text(term)
|
||||
for heading in soup.find_all(["h1", "h2", "h3", "h4", "strong", "b", "a"]):
|
||||
title = heading.get_text(" ", strip=True)
|
||||
title = clean_display_text(heading.get_text(" ", strip=True))
|
||||
if self._term_matches_normalized(normalized_term, normalize_search_text(title)):
|
||||
return title
|
||||
|
||||
text = soup.get_text(" ", strip=True)
|
||||
text = clean_display_text(soup.get_text(" ", strip=True))
|
||||
dated_match = re.search(
|
||||
r"(.{0,40}\d{1,2}\.\d{1,2}\.(?:\d{2,4})?.{0,100}"
|
||||
+ re.escape(term)
|
||||
@@ -802,12 +819,12 @@ class SourceScanner:
|
||||
re.IGNORECASE,
|
||||
)
|
||||
if dated_match:
|
||||
return " ".join(dated_match.group(1).split())
|
||||
return clean_display_text(dated_match.group(1))
|
||||
|
||||
match = re.search(r"(.{0,80}" + re.escape(term) + r".{0,80})", text, re.IGNORECASE)
|
||||
if match:
|
||||
return " ".join(match.group(1).split())
|
||||
return term
|
||||
return clean_display_text(match.group(1))
|
||||
return clean_display_text(term)
|
||||
|
||||
def _find_nearest_link(self, soup: BeautifulSoup, term: str, base_url: str) -> str | None:
|
||||
normalized_term = normalize_search_text(term)
|
||||
|
||||
Reference in New Issue
Block a user