79625968

Date: 2025-05-16 21:48:34
Score: 0.5
Natty:
Report link

I started with the pdf-parser-client-side index.js file and modified it as below. From K J's answer above, I found that each item has a transform array, and that its element at index 5 (`transform[5]`) changes with each new line. I used that to insert a marker (`*!*`) that could later be used to split the data into the lines I needed.

'use client';

import { pdfjs } from 'react-pdf';
// Tell pdf.js where to load its worker script from. The URL is
// protocol-relative and embeds `pdfjs.version`, so the worker fetched from
// unpkg matches the pdfjs build exposed by react-pdf.
pdfjs.GlobalWorkerOptions.workerSrc = `//unpkg.com/pdfjs-dist@${pdfjs.version}/build/pdf.worker.min.mjs`;

/**
 * Extract the text of every page of a PDF, inserting the marker `*!*`
 * between visual lines so the caller can split the result back into lines.
 *
 * A new line is detected whenever an item's `transform[5]` differs from the
 * previous item's value on the same page. A marker is also inserted between
 * consecutive non-empty pages, since a page break always starts a new line.
 * All tokens (item strings and markers) are joined with a single space.
 *
 * @param {Blob|File} file - The PDF to read; passed to `URL.createObjectURL`.
 * @param {*} [variant] - Currently unused; kept for caller compatibility.
 * @returns {Promise<string|undefined>} The marked-up text, or `undefined` if
 *   extraction failed (the error is logged to the console).
 */
async function extractTextFromPDF(file, variant) {
  let blobUrl;
  try {
    // Create a blob URL for the PDF file
    blobUrl = URL.createObjectURL(file);
    // Load the PDF file
    const pdf = await pdfjs.getDocument(blobUrl).promise;
    const tokens = [];

    // Iterate through each page and extract text
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const { items } = await page.getTextContent();
      if (items.length === 0) {
        // Page without text items (e.g. image-only): nothing to add.
        continue;
      }
      if (tokens.length > 0) {
        // A page break is also a line break.
        tokens.push('*!*');
      }

      // Insert '*!*' each time transform[5] changes to separate lines
      let lineMarker = items[0].transform[5];
      for (const item of items) {
        if (item.transform[5] !== lineMarker) {
          lineMarker = item.transform[5];
          tokens.push('*!*');
        }
        tokens.push(item.str);
      }
    }
    return tokens.join(' ');
  } catch (error) {
    console.error('Error extracting text from PDF:', error);
    return undefined;
  } finally {
    // Clean up the blob URL on success and on failure alike
    if (blobUrl !== undefined) {
      URL.revokeObjectURL(blobUrl);
    }
  }
}
export default extractTextFromPDF;

Examples:

textContent.items[2] = {str: 'Friday', dir: 'ltr', width: 21.6, height: 6, transform: Array(6), …}
textContent.items[2].transform = [6, 0, 0, 6, 370.8004000000002, 750.226]
pageText = 'Produced   Friday   09/13/24   14:22   Page   No.   1   YYZ *!*...'
Reasons:
  • Blacklisted phrase (0.5): I need
  • Long answer (-1):
  • Has code block (-0.5):
  • Self-answer (0.5):
  • Low reputation (1):
Posted by: wyodoodoyw