Fix PDF parser - use pdf-parse with proper CommonJS import

This commit is contained in:
Torsten Schulz (local)
2025-10-22 14:42:56 +02:00
parent 57db75e48e
commit ff138a3dc9
10 changed files with 128 additions and 132 deletions

View File

@@ -1,10 +1,10 @@
 import multer from 'multer'
 import fs from 'fs/promises'
 import path from 'path'
-import * as pdfjsLib from 'pdfjs-dist'
+import { createRequire } from 'module'
-// PDF.js Worker konfigurieren
-pdfjsLib.GlobalWorkerOptions.workerSrc = 'pdfjs-dist/build/pdf.worker.min.js'
+const require = createRequire(import.meta.url)
+const pdfParse = require('pdf-parse')
 // Handle both dev and production paths
 const getDataPath = (filename) => {
@@ -68,22 +68,12 @@ export default defineEventHandler(async (event) => {
 })
 }
-// PDF-Text extrahieren mit PDF.js
+// PDF-Text extrahieren mit pdf-parse
 const pdfBuffer = await fs.readFile(file.path)
-const pdfData = await pdfjsLib.getDocument({ data: pdfBuffer }).promise
-let fullText = ''
-// Alle Seiten durchgehen
-for (let pageNum = 1; pageNum <= pdfData.numPages; pageNum++) {
-const page = await pdfData.getPage(pageNum)
-const textContent = await page.getTextContent()
-const pageText = textContent.items.map(item => item.str).join(' ')
-fullText += pageText + '\n'
-}
+const pdfData = await pdfParse(pdfBuffer)
 // Text in HTML-Format konvertieren
-const htmlContent = convertTextToHtml(fullText)
+const htmlContent = convertTextToHtml(pdfData.text)
 // Config aktualisieren
 const configPath = getDataPath('config.json')