Enhance HTML conversion for Satzung uploads by removing page numbers and improving list handling

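This commit updates the text-to-HTML conversion function to strip page-number lines and page footers (e.g., "Seite 2 von 4", "-2-") from the extracted text. It also adds dedicated handling for enumerated lists whose markers (a), b), c), …) stand alone on their own line, merging each marker with the text that follows it into a single list item. In addition, general list detection is tightened: a paragraph is treated as a list only if it contains a bullet character or has a line starting with a bullet, hyphen, or number marker (a hyphen anywhere in the text no longer triggers it), and empty lines no longer produce empty list items. These changes improve the overall quality and readability of the generated HTML content.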
2026-02-06 11:58:23 +01:00
parent 32d4486123
commit 717fdf3025
1 changed file with 34 additions and 4 deletions
--- a/server/api/cms/satzung-upload.post.js
+++ b/server/api/cms/satzung-upload.post.js
@@ -167,6 +167,11 @@ function convertTextToHtml(text) {
    .replace(/\n\s*\n/g, '\n\n') // Mehrfache Zeilenumbrüche reduzieren
    .trim()
  
+  // Seitenzahlen und Seitenfuß entfernen (z.B. "Seite 2 von 4", "-2-")
+  html = html
+    .replace(/^Seite\s+\d+\s+von\s+\d+.*$/gm, '')
+    .replace(/^-+\d+-+\s*$/gm, '')
+  
  // Überschriften erkennen und formatieren
  html = html.replace(/^(Vereinssatzung|Satzung)$/gm, '<h1>$1</h1>')
  html = html.replace(/^(§\s*\d+[^§\n]*)$/gm, '<h2>$1</h2>')
@@ -181,15 +186,40 @@ function convertTextToHtml(text) {
      return paragraph
    }
    
-    // Listen erkennen
-    if (paragraph.includes('•') || paragraph.includes('-') || paragraph.match(/^\d+\./)) {
+    // Spezielle Behandlung für Aufzählungen mit a), b), c) ...
+    if (paragraph.match(/^[a-z]\)\s*$/mi)) {
+      const lines = paragraph.split('\n').map(l => l.trim()).filter(Boolean)
+      const items = []
+      let current = ''
+      
+      for (const line of lines) {
+        if (/^[a-z]\)\s*$/i.test(line)) {
+          // neuer Aufzählungspunkt, vorherigen abschließen
+          if (current) items.push(current.trim())
+          current = line
+        } else {
+          // Text zum aktuellen Aufzählungspunkt hinzufügen
+          current += (current ? ' ' : '') + line
+        }
+      }
+      if (current) items.push(current.trim())
+
+      const listItems = items.map(item => {
+        return `<li>${item}</li>`
+      }).join('')
+
+      return `<ul>${listItems}</ul>`
+    }
+
+    // Allgemeine Listen erkennen (Bullet "•", Bindestrich- oder Nummern-Listen)
+    if (paragraph.includes('•') || paragraph.match(/^[\-•]\s/m) || paragraph.match(/^\d+\.\s/m)) {
      const listItems = paragraph.split(/\n/).map(item => {
        item = item.trim()
        if (item.match(/^[•-]\s/) || item.match(/^\d+\.\s/)) {
          return `<li>${item.replace(/^[•-]\s/, '').replace(/^\d+\.\s/, '')}</li>`
        }
-        return `<li>${item}</li>`
-      }).join('')
+        return item ? `<li>${item}</li>` : ''
+      }).filter(Boolean).join('')
      return `<ul>${listItems}</ul>`
    }