Update PDF parsing and document upload handling in team management

Enhanced PDFParserService to support layout-based text extraction from PDFs using pdfjs-dist, falling back to pdf-parse when layout extraction fails, to improve parsing accuracy. Updated the team management view to streamline document upload and parsing, removing unnecessary UI elements and consolidating the upload logic. Improved error handling and user feedback during document processing so that problems are reported more clearly to the user.
2025-11-08 10:15:25 +01:00
parent f0e3c6a717
commit d79e71d6d7
8 changed files with 688 additions and 238 deletions
--- a/backend/services/pdfParserService.js
+++ b/backend/services/pdfParserService.js
@@ -32,19 +32,28 @@ class PDFParserService {
            // Bestimme Dateityp basierend auf Dateiendung
            const fileExtension = path.extname(filePath).toLowerCase();
            let fileContent;
+            let extractedLines = null;
+            let lineEntries = null;

            if (fileExtension === '.pdf') {
-                // Echte PDF-Parsing
-                const pdfBuffer = fs.readFileSync(filePath);
-                const pdfData = await pdfParse(pdfBuffer);
-                fileContent = pdfData.text;
+                try {
+                    const { text, lines, entries } = await this.extractPdfTextWithLayout(filePath);
+                    fileContent = text;
+                    extractedLines = lines;
+                    lineEntries = entries;
+                } catch (layoutError) {
+                    console.error('[PDFParserService.parsePDF] - Layout extraction failed, falling back to pdf-parse:', layoutError);
+                    const pdfBuffer = fs.readFileSync(filePath);
+                    const pdfData = await pdfParse(pdfBuffer);
+                    fileContent = pdfData.text;
+                }
            } else {
                // Fallback für TXT-Dateien (für Tests)
                fileContent = fs.readFileSync(filePath, 'utf8');
            }

            // Parse den Text nach Spiel-Daten
-            const parsedData = this.extractMatchData(fileContent, clubId);
+            const parsedData = this.extractMatchData(fileContent, clubId, extractedLines, lineEntries);
            
            
            return parsedData;
@@ -60,7 +69,7 @@ class PDFParserService {
     * @param {number} clubId - ID des Vereins
     * @returns {Object} Geparste Daten mit Matches und Metadaten
     */
-    static extractMatchData(text, clubId) {
+    static extractMatchData(text, clubId, providedLines = null, providedLineEntries = null) {
        const matches = [];
        const errors = [];
        const metadata = {
@@ -71,21 +80,33 @@ class PDFParserService {

        try {
            // Teile Text in Zeilen auf
-            const lines = text.split('\n').map(line => line.trim()).filter(line => line.length > 0);
+            const linesSource = providedLines && Array.isArray(providedLines) ? providedLines : text.split('\n');
+            const lines = [];
+            const filteredLineEntries = [];
+
+            linesSource.forEach((line, idx) => {
+                const trimmed = typeof line === 'string' ? line.trim() : '';
+                if (trimmed.length > 0) {
+                    lines.push(trimmed);
+                    if (providedLineEntries && Array.isArray(providedLineEntries) && providedLineEntries[idx]) {
+                        filteredLineEntries.push(providedLineEntries[idx]);
+                    }
+                }
+            });
            metadata.totalLines = lines.length;


            // Verschiedene Parsing-Strategien je nach PDF-Format
            const strategies = [
-                { name: 'Standard Format', fn: this.parseStandardFormat },
-                { name: 'Table Format', fn: this.parseTableFormat },
-                { name: 'List Format', fn: this.parseListFormat }
+                { name: 'Standard Format', fn: (lns, club, entries) => PDFParserService.parseStandardFormat(lns, club, entries) },
+                { name: 'Table Format', fn: (lns, club, entries) => PDFParserService.parseTableFormat(lns, club, entries) },
+                { name: 'List Format', fn: (lns, club, entries) => PDFParserService.parseListFormat(lns, club, entries) }
            ];

            
            for (const strategy of strategies) {
                try {
-                    const result = strategy.fn(lines, clubId);
+                    const result = strategy.fn(lines, clubId, filteredLineEntries.length === lines.length ? filteredLineEntries : null);
                    
                    if (result.matches.length > 0) {
                        console.log(`[PDF Parser] Using strategy: ${strategy.name}, found ${result.matches.length} matches`);
@@ -134,12 +155,29 @@ class PDFParserService {
     * @param {number} clubId - ID des Vereins
     * @returns {Object} Geparste Matches
     */
-    static parseStandardFormat(lines, clubId) {
+    static parseStandardFormat(lines, clubId, lineEntries = null) {
        const matches = [];
        
        
        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
+            const lineDetail = Array.isArray(lineEntries) ? lineEntries[i] : null;
+            const columnSegments = lineDetail ? this.segmentLineByPositions(lineDetail) : null;
+            let homeFromColumns = null;
+            let guestFromColumns = null;
+            let codeFromColumns = null;
+
+            if (columnSegments && columnSegments.length >= 3) {
+                homeFromColumns = columnSegments[1]?.trim() || null;
+                guestFromColumns = columnSegments[2]?.trim() || null;
+                const lastSegment = columnSegments[columnSegments.length - 1];
+                if (lastSegment) {
+                    const candidateCode = lastSegment.replace(/\s+/g, '').trim();
+                    if (/^[A-Z0-9]{12}$/.test(candidateCode)) {
+                        codeFromColumns = candidateCode;
+                    }
+                }
+            }
            
            // Suche nach Datum-Pattern (dd.mm.yyyy oder dd/mm/yyyy)
            const dateMatch = line.match(/(\d{1,2})[./](\d{1,2})[./](\d{4})/);
@@ -181,7 +219,7 @@ class PDFParserService {
                    const cleanLine3 = cleanLine2.replace(/\([^)]*\)/g, '');
                    
                    // Suche nach Code (12 Zeichen) oder PIN (4 Ziffern) am Ende
-                    const codeMatch = cleanLine3.match(/([A-Z0-9]{12})$/);
+                    let codeMatch = cleanLine3.match(/([A-Z0-9]{12})$/);
                    const pinMatch = cleanLine3.match(/(\d{4})$/);
                    
                    let code = null;
@@ -222,6 +260,11 @@ class PDFParserService {
                        }
                    }
                    
+                    if (!code && codeFromColumns) {
+                        code = codeFromColumns;
+                        teamsPart = teamsPart.replace(new RegExp(`${code}$`), '').trim();
+                    }
+
                    if (code || pinMatch) {
                        
                        
@@ -275,39 +318,89 @@ class PDFParserService {
                            // Strategie 1: Suche nach "Harheimer TC" als Heimteam oder Gastteam
                            if (teamsPart.includes('Harheimer TC')) {
                                const harheimerIndex = teamsPart.indexOf('Harheimer TC');
-                                
-                                // Prüfe, ob "Harheimer TC" am Anfang oder am Ende steht
                                let beforeHarheimer = teamsPart.substring(0, harheimerIndex).trim();
                                let afterHarheimer = teamsPart.substring(harheimerIndex + 'Harheimer TC'.length).trim();
                                
-                                // Entferne Spielnummern aus beiden Teilen
-                                beforeHarheimer = beforeHarheimer.replace(/^\d+/, '').trim();
-                                afterHarheimer = afterHarheimer.replace(/^\d+/, '').trim();
+                                beforeHarheimer = beforeHarheimer
+                                    .replace(/^\(\d+\)/, '')
+                                    .replace(/^\d+/, '')
+                                    .trim();
+                                afterHarheimer = afterHarheimer
+                                    .replace(/^\(\d+\)/, '')
+                                    .replace(/^\d+/, '')
+                                    .trim();
                                
-                                if (beforeHarheimer && !afterHarheimer) {
-                                    // "Harheimer TC" ist am Ende → Harheimer ist Gastteam
+                                const romanNumeralCandidates = ['XII', 'XI', 'X', 'IX', 'VIII', 'VII', 'VI', 'V', 'IV', 'III', 'II', 'I'];
+
+                                const matchLeadingRoman = (token) => {
+                                    if (!token) {
+                                        return null;
+                                    }
+                                    const normalizedToken = token.trim();
+                                    for (const candidate of romanNumeralCandidates) {
+                                        if (normalizedToken.startsWith(candidate)) {
+                                            const nextChar = normalizedToken.charAt(candidate.length);
+                                            if (!nextChar || /\s|[A-ZÄÖÜẞ]/.test(nextChar)) {
+                                                const remainder = normalizedToken.slice(candidate.length).trimStart();
+                                                return { roman: candidate, remainder };
+                                            }
+                                        }
+                                    }
+                                    return null;
+                                };
+
+                                const extractLeadingRomanFromTokens = (tokenList) => {
+                                    const tokensCopy = Array.isArray(tokenList) ? [...tokenList] : [];
+                                    if (tokensCopy.length === 0) {
+                                        return { roman: null, tokens: tokensCopy };
+                                    }
+
+                                    const firstToken = tokensCopy[0];
+                                    const match = matchLeadingRoman(firstToken);
+
+                                    if (match) {
+                                        const { roman, remainder } = match;
+                                        if (remainder) {
+                                            tokensCopy[0] = remainder;
+                                        } else {
+                                            tokensCopy.shift();
+                                        }
+                                        return { roman, tokens: tokensCopy };
+                                    }
+
+                                    return { roman: null, tokens: tokensCopy };
+                                };
+
+                                if (!beforeHarheimer && afterHarheimer) {
+                                    const tokens = afterHarheimer.split(/\s+/).filter(Boolean);
+                                    const { roman: homeRoman, tokens: guestTokens } = extractLeadingRomanFromTokens(tokens);
+                                    const homeSuffix = homeRoman ? ` ${homeRoman}` : '';
+                                    homeTeamName = `Harheimer TC${homeSuffix}`;
+                                    guestTeamName = guestTokens.join(' ').trim();
+                                } else if (beforeHarheimer && !afterHarheimer) {
+                                    // "Harheimer TC" ist Gastteam ohne weitere Tokens
+                                    homeTeamName = beforeHarheimer.replace(/\([^)]*\)/g, '').trim();
                                    guestTeamName = 'Harheimer TC';
-                                    homeTeamName = beforeHarheimer
-                                        .replace(/\([^)]*\)/g, '') // Entferne Klammern
-                                        .trim();
-                                } else if (!beforeHarheimer && afterHarheimer) {
-                                    // "Harheimer TC" ist am Anfang → Harheimer ist Heimteam
-                                    homeTeamName = 'Harheimer TC';
-                                    guestTeamName = afterHarheimer
-                                        .replace(/\([^)]*\)/g, '') // Entferne Klammern
-                                        .trim();
                                } else if (beforeHarheimer && afterHarheimer) {
-                                    // "Harheimer TC" ist in der Mitte → verwende Position als Hinweis
-                                    // Normalerweise: Heimteam zuerst, dann Gastteam
-                                    homeTeamName = beforeHarheimer
-                                        .replace(/\([^)]*\)/g, '') // Entferne Klammern
-                                        .trim();
-                                    guestTeamName = 'Harheimer TC';
+                                    // "Harheimer TC" steht in der Mitte → Harheimer ist Gast, Tokens nach Harheimer gehören zu ihm
+                                    homeTeamName = beforeHarheimer.replace(/\([^)]*\)/g, '').trim();
+                                    const tokens = afterHarheimer.split(/\s+/).filter(Boolean);
+                                    const { roman: guestRoman, tokens: remainingTokens } = extractLeadingRomanFromTokens(tokens);
+                                    const guestSuffix = guestRoman ? ` ${guestRoman}` : '';
+                                    guestTeamName = `Harheimer TC${guestSuffix}`;
+                                    if (remainingTokens.length > 0) {
+                                        const trailingText = remainingTokens.join(' ').trim();
+                                        if (trailingText) {
+                                            guestTeamName = `${guestTeamName} ${trailingText}`.trim();
+                                        }
+                                    }
                                } else {
-                                    // Nur "Harheimer TC" ohne andere Teams → ungültig
+                                    // Nur "Harheimer TC" ohne weitere Kontexte → überspringen
                                    continue;
                                }
-                                
+
+                                homeTeamName = homeTeamName.replace(/\([^)]*\)/g, '').trim();
+                                guestTeamName = guestTeamName.replace(/\([^)]*\)/g, '').trim();
                            } else {
                                // Strategie 2: Suche nach Großbuchstaben am Anfang des zweiten Teams
                                const teamSplitMatch = teamsPart.match(/^([A-Za-z0-9\s\-\.]+?)\s+([A-Z][A-Za-z0-9\s\-\.]+)$/);
@@ -322,6 +415,13 @@ class PDFParserService {
                            }
                        }
                        
+                        if (homeFromColumns) {
+                            homeTeamName = homeFromColumns;
+                        }
+                        if (guestFromColumns) {
+                            guestTeamName = guestFromColumns;
+                        }
+
                        if (homeTeamName && guestTeamName) {
                            let debugInfo;
                            if (code) {
@@ -358,13 +458,59 @@ class PDFParserService {
        return { matches };
    }

+    static segmentLineByPositions(lineDetail) {
+        if (!lineDetail || !Array.isArray(lineDetail.items)) {
+            return null;
+        }
+
+        const intraWordGapThreshold = 1.5;
+        const columnGapThreshold = 12;
+        const segments = [];
+
+        let currentSegment = '';
+        let previousItem = null;
+
+        lineDetail.items.forEach((item) => {
+            if (!item || typeof item.text !== 'string') {
+                return;
+            }
+            const text = item.text;
+            if (!text || text.trim().length === 0) {
+                return;
+            }
+
+            if (previousItem) {
+                const previousEnd = previousItem.x + previousItem.width;
+                const gap = item.x - previousEnd;
+
+                if (gap > columnGapThreshold) {
+                    if (currentSegment.trim().length > 0) {
+                        segments.push(currentSegment.trim());
+                    }
+                    currentSegment = '';
+                } else if (gap > intraWordGapThreshold) {
+                    currentSegment += ' ';
+                }
+            }
+
+            currentSegment += text;
+            previousItem = item;
+        });
+
+        if (currentSegment.trim().length > 0) {
+            segments.push(currentSegment.trim());
+        }
+
+        return segments.length > 0 ? segments : null;
+    }
+ 
    /**
     * Tabellen-Format Parser
     * @param {Array} lines - Textzeilen
     * @param {number} clubId - ID des Vereins
     * @returns {Object} Geparste Matches
     */
-    static parseTableFormat(lines, clubId) {
+    static parseTableFormat(lines, clubId, lineEntries = null) {
        const matches = [];
        
        // Suche nach Tabellen-Header
@@ -428,7 +574,7 @@ class PDFParserService {
     * @param {number} clubId - ID des Vereins
     * @returns {Object} Geparste Matches
     */
-    static parseListFormat(lines, clubId) {
+    static parseListFormat(lines, clubId, lineEntries = null) {
        const matches = [];
        
        for (let i = 0; i < lines.length; i++) {
@@ -559,13 +705,10 @@ class PDFParserService {
                    const matchingMatch = existingMatches.find(match => {
                        if (!match.guestTeam) return false;
                        
-                        const guestTeamName = match.guestTeam.name.toLowerCase();
-                        const searchGuestName = matchData.guestTeamName.toLowerCase();
-                        
-                        // Exakte Übereinstimmung oder Teilstring-Match
-                        return guestTeamName === searchGuestName || 
-                               guestTeamName.includes(searchGuestName) ||
-                               searchGuestName.includes(guestTeamName);
+                        const guestTeamName = match.guestTeam.name;
+                        const searchGuestName = matchData.guestTeamName;
+
+                        return PDFParserService.namesRoughlyMatch(guestTeamName, searchGuestName);
                    });
                    
                    if (matchingMatch) {
@@ -631,8 +774,7 @@ class PDFParserService {
                            // Fuzzy-Matching für Team-Namen
                            if (!homeTeam) {
                                homeTeam = allTeams.find(t => 
-                                    t.name.toLowerCase().includes(matchData.homeTeamName.toLowerCase()) ||
-                                    matchData.homeTeamName.toLowerCase().includes(t.name.toLowerCase())
+                                    PDFParserService.namesRoughlyMatch(t.name, matchData.homeTeamName)
                                );
                                
                                if (homeTeam) {
@@ -642,8 +784,7 @@ class PDFParserService {
                            
                            if (!guestTeam) {
                                guestTeam = allTeams.find(t => 
-                                    t.name.toLowerCase().includes(matchData.guestTeamName.toLowerCase()) ||
-                                    matchData.guestTeamName.toLowerCase().includes(t.name.toLowerCase())
+                                    PDFParserService.namesRoughlyMatch(t.name, matchData.guestTeamName)
                                );
                                
                                if (guestTeam) {
@@ -694,6 +835,150 @@ class PDFParserService {
            throw error;
        }
    }
+
+    static async extractPdfTextWithLayout(filePath) {
+        const { default: pdfjsLib } = await import('pdfjs-dist/legacy/build/pdf.js');
+        const pdfData = new Uint8Array(fs.readFileSync(filePath));
+        const loadingTask = pdfjsLib.getDocument({ data: pdfData, disableWorker: true });
+        const pdf = await loadingTask.promise;
+
+        const lineEntries = [];
+        const lineTolerance = 2; // Toleranz für Zeilenhöhe
+        const spaceGapThreshold = 1.5; // Mindestabstand, um ein Leerzeichen einzufügen
+
+        for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
+            const page = await pdf.getPage(pageNumber);
+            const textContent = await page.getTextContent({ normalizeWhitespace: false });
+            const pageLines = [];
+
+            textContent.items.forEach((item) => {
+                if (!item || typeof item.str !== 'string') {
+                    return;
+                }
+                const text = item.str;
+                if (!text || text.trim().length === 0) {
+                    return;
+                }
+
+                const [scaleX, , , , x, y] = item.transform;
+                const width = (item.width || 0) * (scaleX || 1);
+
+                let targetLine = pageLines.find((line) => Math.abs(line.y - y) < lineTolerance);
+                if (!targetLine) {
+                    targetLine = { y, items: [] };
+                    pageLines.push(targetLine);
+                }
+
+                targetLine.items.push({
+                    text,
+                    x,
+                    y,
+                    width
+                });
+            });
+
+            // Sortiere Zeilen von oben nach unten
+            pageLines.sort((a, b) => b.y - a.y);
+
+            pageLines.forEach((line) => {
+                // Sortiere Zeichen von links nach rechts
+                line.items.sort((a, b) => a.x - b.x);
+
+                let lineText = '';
+                let previousItem = null;
+
+                line.items.forEach((item) => {
+                    if (previousItem) {
+                        const previousEnd = previousItem.x + previousItem.width;
+                        const gap = item.x - previousEnd;
+                        if (gap > spaceGapThreshold) {
+                            lineText += ' ';
+                        }
+                    }
+
+                    lineText += item.text;
+                    previousItem = item;
+                });
+
+                const normalized = lineText.trim();
+                if (normalized.length > 0) {
+                    lineEntries.push({
+                        text: normalized,
+                        items: line.items.map((item) => ({
+                            text: item.text,
+                            x: item.x,
+                            y: item.y,
+                            width: item.width
+                        }))
+                    });
+                }
+            });
+        }
+
+        await pdf.destroy();
+
+        const lines = lineEntries.map((entry) => entry.text);
+        const text = lines.join('\n');
+        return { text, lines, entries: lineEntries };
+    }
+
+    static normalizeTeamName(name) {
+        if (!name || typeof name !== 'string') return '';
+        return name
+            .toLowerCase()
+            .replace(/\u2026/g, '...')
+            .replace(/\s+/g, ' ')
+            .trim();
+    }
+
+    static matchWithEllipsis(pattern, target) {
+        const normalizedPattern = PDFParserService.normalizeTeamName(pattern);
+        const normalizedTarget = PDFParserService.normalizeTeamName(target);
+
+        if (!normalizedPattern.includes('...')) {
+            return normalizedTarget.includes(normalizedPattern);
+        }
+
+        const segments = normalizedPattern.split('...').map(segment => segment.trim()).filter(Boolean);
+        if (segments.length === 0) {
+            return true;
+        }
+
+        let currentIndex = 0;
+        for (const segment of segments) {
+            const foundIndex = normalizedTarget.indexOf(segment, currentIndex);
+            if (foundIndex === -1) {
+                return false;
+            }
+            currentIndex = foundIndex + segment.length;
+        }
+
+        return true;
+    }
+
+    static namesRoughlyMatch(nameA, nameB) {
+        const normalizedA = PDFParserService.normalizeTeamName(nameA);
+        const normalizedB = PDFParserService.normalizeTeamName(nameB);
+
+        if (!normalizedA || !normalizedB) {
+            return false;
+        }
+
+        if (normalizedA === normalizedB) {
+            return true;
+        }
+
+        if (normalizedA.includes('...') || normalizedB.includes('...')) {
+            if (PDFParserService.matchWithEllipsis(normalizedA, normalizedB)) {
+                return true;
+            }
+            if (PDFParserService.matchWithEllipsis(normalizedB, normalizedA)) {
+                return true;
+            }
+        }
+
+        return normalizedA.includes(normalizedB) || normalizedB.includes(normalizedA);
+    }
 }

 export default PDFParserService;