Fix PDF parser: replace pdfjs-dist with pdf-parse, loaded via createRequire (CommonJS import from an ES module)
This commit is contained in:
@@ -1,10 +1,10 @@
|
||||
import multer from 'multer'
|
||||
import fs from 'fs/promises'
|
||||
import path from 'path'
|
||||
import * as pdfjsLib from 'pdfjs-dist'
|
||||
import { createRequire } from 'module'
|
||||
|
||||
// PDF.js Worker konfigurieren
|
||||
pdfjsLib.GlobalWorkerOptions.workerSrc = 'pdfjs-dist/build/pdf.worker.min.js'
|
||||
const require = createRequire(import.meta.url)
|
||||
const pdfParse = require('pdf-parse')
|
||||
|
||||
// Handle both dev and production paths
|
||||
const getDataPath = (filename) => {
|
||||
@@ -68,22 +68,12 @@ export default defineEventHandler(async (event) => {
|
||||
})
|
||||
}
|
||||
|
||||
// PDF-Text extrahieren mit PDF.js
|
||||
// PDF-Text extrahieren mit pdf-parse
|
||||
const pdfBuffer = await fs.readFile(file.path)
|
||||
const pdfData = await pdfjsLib.getDocument({ data: pdfBuffer }).promise
|
||||
|
||||
let fullText = ''
|
||||
|
||||
// Alle Seiten durchgehen
|
||||
for (let pageNum = 1; pageNum <= pdfData.numPages; pageNum++) {
|
||||
const page = await pdfData.getPage(pageNum)
|
||||
const textContent = await page.getTextContent()
|
||||
const pageText = textContent.items.map(item => item.str).join(' ')
|
||||
fullText += pageText + '\n'
|
||||
}
|
||||
const pdfData = await pdfParse(pdfBuffer)
|
||||
|
||||
// Text in HTML-Format konvertieren
|
||||
const htmlContent = convertTextToHtml(fullText)
|
||||
const htmlContent = convertTextToHtml(pdfData.text)
|
||||
|
||||
// Config aktualisieren
|
||||
const configPath = getDataPath('config.json')
|
||||
|
||||
Reference in New Issue
Block a user