117 lines
3.6 KiB
Python
117 lines
3.6 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""Deterministic post-processing for short dictations (e.g. front-desk notes).

- No AI, no server round trip, no connection to the main KG/Whisper dictation
  pipeline.
- Parallel logic for the browser: ``normalizeNoteDictationText`` /
  ``normalizeMedicalDictationTerms`` in ``web/empfang.html`` (when changing
  anything here, please update the JS mirror as well).

Reuse: desktop or other local dictation fields may import these functions
without touching ``basis14``/KG code.
"""
|
|
|
|
from __future__ import annotations

import re
from typing import List, Optional, Tuple
|
|
|
|
# Spoken control phrases and the line-break text that replaces them.
# A phrase is only replaced as a whole, delimited by whitespace / word
# boundaries (never inside a word).
_NOTE_DICTATION_PHRASES: Tuple[Tuple[str, str], ...] = (
    # The "... bitte" variants come first: entries are applied in table order,
    # so the longer phrase is consumed before its shorter prefix can match.
    ("neue zeile bitte", "\n"),
    ("neuer absatz bitte", "\n\n"),
    ("neuen absatz bitte", "\n\n"),
    # Plain variants.
    ("neue zeile", "\n"),
    ("neuer absatz", "\n\n"),
    ("neuen absatz", "\n\n"),
)
|
|
|
|
# Spoken/lower-case form -> canonical spelling.
# Longer entries first so that shorter terms cannot split longer ones.
# NOTE(review): the patterns compiled from this table are whitespace-delimited,
# so the order looks irrelevant for the result - confirm before relying on it.
_MEDICAL_CANONICAL: Tuple[Tuple[str, str], ...] = (
    ("aktinische keratose", "Aktinische Keratose"),
    ("seborrhoische keratose", "Seborrhoische Keratose"),
    ("dermatoskopie", "Dermatoskopie"),
    ("kryotherapie", "Kryotherapie"),
    ("onychomykose", "Onychomykose"),
    ("urtikaria", "Urtikaria"),
    ("dermatitis", "Dermatitis"),
    ("histologie", "Histologie"),
    ("biopsie", "Biopsie"),
    ("exzision", "Exzision"),
    ("psoriasis", "Psoriasis"),
    ("rosazea", "Rosazea"),
    ("melanom", "Melanom"),
    ("spinaliom", "Spinaliom"),
    ("basaliom", "Basaliom"),
    ("mykose", "Mykose"),
    ("tinea", "Tinea"),
    ("ekzem", "Ekzem"),
    # Both transliterations map to the umlaut spelling.
    ("naevus", "Nävus"),
    ("nevus", "Nävus"),
    # These two are canonically lower-case (a capitalised match is lowered).
    ("systemisch", "systemisch"),
    ("topisch", "topisch"),
)
|
|
|
|
|
|
def _phrase_replace_all(
    text: str,
    phrases: Optional[Tuple[Tuple[str, str], ...]] = None,
) -> str:
    """Replace spoken control phrases in *text* with their replacement strings.

    Each phrase is matched case-insensitively and only as a whole: it must be
    preceded and followed by whitespace or the start/end of the text, so it is
    never replaced inside a longer token. Entries are applied in table order.

    The words of a multi-word phrase may be separated by any run of spaces or
    tabs. ``normalize_note_dictation_text`` collapses whitespace only after
    this step, so requiring exactly one space would miss input such as
    ``"neue  zeile"``.

    NOTE(review): the JS mirror in ``web/empfang.html`` is not visible here;
    confirm it tolerates repeated spaces the same way.

    Args:
        text: Text to scan.
        phrases: Optional ``(phrase, replacement)`` table. Defaults to
            ``_NOTE_DICTATION_PHRASES``.

    Returns:
        The text with every matching phrase replaced.
    """
    table = _NOTE_DICTATION_PHRASES if phrases is None else phrases
    result = text
    for phrase, replacement in table:
        # Escape word by word so the gap between words can stay flexible.
        words = r"[ \t]+".join(re.escape(word) for word in phrase.split())
        pattern = r"(?<!\S)" + words + r"(?!\S)"
        result = re.sub(pattern, replacement, result, flags=re.IGNORECASE)
    return result
|
|
|
|
|
|
def _collapse_inline_spaces_segment(seg: str) -> str:
    """Squeeze runs of spaces/tabs in *seg* to one space and trim both ends."""
    squeezed = re.sub(r"[ \t]+", " ", seg)
    return squeezed.strip()
|
|
|
|
|
|
def _finalize_whitespace(text: str) -> str:
    """Tidy whitespace line by line and limit consecutive line breaks.

    Every ``\\n``-separated line is passed through
    ``_collapse_inline_spaces_segment``; afterwards runs of three or more
    newlines are reduced to two, and trailing whitespace of the whole text is
    removed. Leading newlines are kept.
    """
    tidy_lines = map(_collapse_inline_spaces_segment, text.split("\n"))
    joined = "\n".join(tidy_lines)
    # At most one empty line in a row.
    squeezed = re.sub(r"\n{3,}", "\n\n", joined)
    return squeezed.rstrip()
|
|
|
|
|
|
def normalize_note_dictation_text(text: str) -> str:
    """Replace unambiguous spoken control phrases with line breaks.

    Matching is whole-phrase and case-insensitive (see
    ``_phrase_replace_all``); the result is whitespace-normalised with
    ``_finalize_whitespace``.

    A lone "Absatz" is deliberately not replaced (ambiguous), and there is no
    handling of "Punkt"/"Komma".

    ``None`` and blank input yield the empty string; any other value is
    converted with ``str()`` first.
    """
    if text is None:
        return ""
    trimmed = str(text).strip()
    if trimmed:
        return _finalize_whitespace(_phrase_replace_all(trimmed))
    return ""
|
|
|
|
|
|
def _compile_medical_patterns() -> List[Tuple[re.Pattern[str], str]]:
    """Build ``(compiled pattern, canonical spelling)`` pairs from the term table.

    Each term from ``_MEDICAL_CANONICAL`` is escaped and wrapped so that it
    only matches case-insensitively when it is delimited by whitespace or the
    start/end of the text. The table order is preserved.
    """
    return [
        (
            re.compile(r"(?<!\S)" + re.escape(spoken) + r"(?!\S)", re.IGNORECASE),
            canonical,
        )
        for spoken, canonical in _MEDICAL_CANONICAL
    ]
|
|
|
|
|
|
# Compiled once at import time; normalize_medical_dictation_terms() iterates
# over these pairs instead of rebuilding the patterns on every call.
_MEDICAL_PATTERNS = _compile_medical_patterns()
|
|
|
|
|
|
def normalize_medical_dictation_terms(text: str) -> str:
    """Rewrite known medical terms to their canonical spelling.

    Only clear, delimited matches from ``_MEDICAL_PATTERNS`` are replaced;
    nothing is interpreted or re-worded.

    ``None`` yields the empty string. Blank input is returned unchanged (after
    ``str()`` conversion), and whitespace in the text is never altered.
    """
    if text is None:
        return ""
    result = str(text)
    if result.strip():
        for pattern, canonical in _MEDICAL_PATTERNS:
            result = pattern.sub(canonical, result)
    return result
|
|
|
|
|
|
def normalize_note_dictation_pipeline(text: str) -> str:
    """Run the full note clean-up.

    Order: control phrases -> medical spelling -> whitespace.
    """
    return _finalize_whitespace(
        normalize_medical_dictation_terms(normalize_note_dictation_text(text))
    )
|