I started with the `index.js` file from pdf-parser-client-side and modified it as below. From K J's answer above, I found that each item has a transform array, and that its element at index 5 (the vertical position of the text) changes with each line. I used that to insert a marker (`*!*`) that could later be used to split the data into the lines I needed.
'use client';
import { pdfjs } from 'react-pdf';
// Point pdf.js at its worker script on unpkg, using a protocol-relative URL
// and the version reported by the imported pdfjs build.
pdfjs.GlobalWorkerOptions.workerSrc = `//unpkg.com/pdfjs-dist@${pdfjs.version}/build/pdf.worker.min.mjs`;
/**
 * Extracts the text of every page of a PDF and marks line breaks with `*!*`.
 *
 * Each text item returned by pdf.js carries a `transform` array; the value at
 * index 5 changes whenever the text moves to a new line. A `*!*` marker is
 * inserted between items whose value differs, and between consecutive pages,
 * so the caller can split the result on `*!*` to recover individual lines.
 *
 * @param {Blob|File} file - The PDF to read; it is passed to `URL.createObjectURL`.
 * @param {*} [variant] - Currently unused; kept so existing callers keep working.
 * @returns {Promise<string|undefined>} Items joined by single spaces with `*!*`
 *   markers between lines, or `undefined` if extraction failed (the error is
 *   logged to the console rather than thrown).
 */
async function extractTextFromPDF(file, variant) {
  const LINE_MARKER = '*!*';
  let blobUrl;
  try {
    // pdf.js loads the document from a temporary blob URL.
    blobUrl = URL.createObjectURL(file);
    const pdf = await pdfjs.getDocument(blobUrl).promise;

    const pages = [];
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const { items } = await page.getTextContent();

      const pageText = [];
      let currentLine; // transform[5] of the line being collected
      for (const item of items) {
        const linePosition = item.transform[5];
        // A different vertical position means a new line started; the first
        // item on a page never gets a leading marker.
        if (currentLine !== undefined && linePosition !== currentLine) {
          pageText.push(LINE_MARKER);
        }
        currentLine = linePosition;
        pageText.push(item.str);
      }

      // Pages without any text items contribute nothing (and no stray marker).
      if (pageText.length > 0) {
        pages.push(pageText.join(' '));
      }
    }

    // A page break is also a line break, so pages are joined with the marker.
    return pages.join(` ${LINE_MARKER} `);
  } catch (error) {
    console.error('Error extracting text from PDF:', error);
    return undefined;
  } finally {
    // Always release the blob URL, on success and on failure alike.
    if (blobUrl !== undefined) {
      URL.revokeObjectURL(blobUrl);
    }
  }
}
export default extractTextFromPDF;
Examples:
textContent.items[2] = {str: 'Friday', dir: 'ltr', width: 21.6, height: 6, transform: Array(6), …}
textContent.items[2].transform = [6, 0, 0, 6, 370.8004000000002, 750.226]
pageText = 'Produced Friday 09/13/24 14:22 Page No. 1 YYZ *!*...'