How to extract data from a PDF file in Node.js

Question

I have already used 'pdf.js-extract' npm module to fetch data from pdf.

// Load the extractor class exported by the pdf.js-extract package.
const { PDFExtract } = require('pdf.js-extract');

const filename = '/home/aman/Downloads/sample_invoice.pdf';
const pdfExtract = new PDFExtract();

// Log the error when extraction fails; otherwise log the result serialized as JSON.
pdfExtract.extract(filename, (err, data) => {
    console.log(err ? err : JSON.stringify(data));
});

But I am not getting the desired result. I want to extract the relevant information from an invoice PDF, such as the tax, the total amount paid, and the seller's address, and save the extracted data into a MongoDB collection.

It will always be the same invoice format (positions)? Can you show us the return of console.log(JSON.stringify(data));? — Liberateur
– Liberateur, Commented Aug 22, 2018 at 8:02
Actually, the PDF is generated by marketplaces like Amazon, Flipkart, etc., so the format can differ. — Ayushi Gupta
– Ayushi Gupta, Commented Aug 22, 2018 at 8:11
The module will extract the data, but each invoice will have a different format... So either write one function per invoice type that reads the data according to that invoice's template, or write a script that recognizes the content... — Liberateur
– Liberateur, Commented Aug 22, 2018 at 8:18
Send an example of what you are getting console.log(JSON.stringify(data)); — Liberateur
– Liberateur, Commented Aug 22, 2018 at 8:18
the response i am getting is { "pages": [ { "content": [ { "x": 348.41, "y": 125.59899999999993, "str": "Invoice Number", "dir": "ltr", "width": 61.61760000000001, "height": 8.8, "fontName": "g_d0_f2" }, { "x": 451.935, "y": 125.59899999999993, "str": "INV-3337", "dir": "ltr", "width": 37.171200000000006, "height": 8.8, "fontName": "g_d0_f2" }, ], } — Ayushi Gupta
– Ayushi Gupta, Commented Aug 22, 2018 at 8:45

Liberateur · Accepted Answer · 2018-08-22 09:54:04Z

1

You must write one function per invoice format (fn company1, fn company2...).

Here is an example with three different functions to retrieve data in the export of the pdf.js-extract module:

// Sample invoice: a single page whose content holds a label item followed by a value item
const sampleInvoice = {
  pages: [
    {
      content: [
        {
          x: 348.41,
          y: 125.59899999999993,
          str: 'Invoice Number',
          dir: 'ltr',
          width: 61.61760000000001,
          height: 8.8,
          fontName: 'g_d0_f2',
        },
        {
          x: 451.935,
          y: 125.59899999999993,
          str: 'INV-3337',
          dir: 'ltr',
          width: 37.171200000000006,
          height: 8.8,
          fontName: 'g_d0_f2',
        },
      ],
    },
  ],
};


// Run each lookup in turn and show its result in an alert (for testing in a browser)
const demoLookups = [
  () => searchByPosition(sampleInvoice.pages, 450, 125),
  () => searchByPrev(sampleInvoice.pages, 'Invoice Number'),
  () => searchByFormat(sampleInvoice.pages, /INV-\d+$/),
];

for (const runLookup of demoLookups) {
  alert(runLookup());
}


/**
 * Find the first text item located near a given position.
 *
 * Pages and their content items are scanned in order. The `str` of the first
 * item whose `x` and `y` are both within `range` units of the requested
 * coordinates is returned.
 *
 * The distance is measured directly rather than by flooring both positions
 * into fixed-size buckets: bucketing misses items that are very close to the
 * requested point but sit on the other side of a bucket boundary
 * (e.g. 449 vs 451.9 with a bucket size of 10).
 *
 * @param {Array<{content: Array<{x: number, y: number, str: string}>}>} pages - Pages to scan.
 * @param {number} x - Expected horizontal position.
 * @param {number} y - Expected vertical position.
 * @param {number} [range=10] - Maximum allowed difference on each axis.
 * @returns {string} Text of the first matching item, or 'NotFound'.
 */
function searchByPosition(pages, x, y, range = 10)
{
    // Loop in all pages
    for (const page of pages) {

        // Loop in all content
        for (const item of page.content) {

            // An item matches when it is close enough on both axes
            const isNearX = Math.abs(item.x - x) <= range;
            const isNearY = Math.abs(item.y - y) <= range;

            if (isNearX && isNearY) {
                return item.str;
            }
        }
    }

    // No results found
    return 'NotFound';
}


/**
 * Look up a value by the label that comes right before it.
 *
 * Each page's content is scanned for an item whose text equals `txt`
 * (ignoring case) and that has an item after it; the text of that following
 * item is returned.
 *
 * @param {Array<{content: Array<{str: string}>}>} pages - Pages to scan.
 * @param {string} txt - Label text to look for (case-insensitive).
 * @returns {string} Text of the item following the label, or 'NotFound'.
 */
function searchByPrev(pages, txt)
{
    const wantedLabel = txt.toLowerCase();

    for (const { content } of pages) {

        // A label only counts when another item follows it.
        // (Reading the item at labelIndex - 1 instead would give a searchByNext function)
        const labelIndex = content.findIndex(
            (item, index) => item.str.toLowerCase() === wantedLabel && content[index + 1]
        );

        if (labelIndex !== -1) {
            return content[labelIndex + 1].str;
        }
    }

    // Nothing matched on any page
    return 'NotFound';
}


/**
 * Find the first text item whose content matches a regular expression.
 *
 * `RegExp.prototype.test` is stateful for regexes carrying the `g` or `y`
 * flag: it starts at `lastIndex` and advances it after a match. Without a
 * reset, a reused global regex would start matching in the middle of the next
 * string and skip valid items, so `lastIndex` is reset around every test.
 *
 * @param {Array<{content: Array<{str: string}>}>} pages - Pages to scan.
 * @param {RegExp} regex - Pattern the item text must match.
 * @returns {string} Text of the first matching item, or 'NotFound'.
 */
function searchByFormat(pages, regex)
{
    // Loop in all pages
    for (const page of pages) {

        // Loop in all content
        for (const item of page.content) {

            // Always start matching from the beginning of the string
            regex.lastIndex = 0;
            const isMatch = regex.test(item.str);

            // Leave the caller's regex in a clean state
            regex.lastIndex = 0;

            if (isMatch) {
                return item.str;
            }
        }
    }

    // No results found
    return 'NotFound';
}

TRY HERE : https://jsfiddle.net/dkhqzg6s/

answered Aug 22, 2018 at 9:54

Liberateur

1,4971 gold badge16 silver badges33 bronze badges

Sign up to request clarification or add additional context in comments.

3 Comments

Ayushi Gupta Over a year ago

Thanks a lot, Liberateur. But the response I sent you is just a glimpse of it. I have to store the invoice number, tax, due date, etc. in the MongoDB collection. What should I do for this?

Ayushi Gupta Over a year ago

With this approach we have to write a separate function to fetch the data for each invoice. Can you suggest another way to do it? Any help will be appreciated. Thanks!

Liberateur Over a year ago

You must use the functions for each desired data and write a function for each type of invoice. I do not see how to do otherwise unless you want to enter the field of deep learning; but you will lose too much time compared to my first solution.

sanjeev kumar · Accepted Answer · 2018-08-22 09:58:26Z

0

Please refer to GitHub Repository of pdf.js-extract npm module https://github.com/ffalt/pdf.js-extract

The file below is provided at example/example.js in the GitHub repository:

const fs = require('fs');
const { PDFExtract } = require('../lib');

const pdfExtract = new PDFExtract();

pdfExtract.extract('./example.pdf', {} /* options*/, (err, data) => {
    if (err) {
        console.log(err);
        return;
    }

    // Dump the raw extraction result as tab-indented JSON
    fs.writeFileSync('./example-output.json', JSON.stringify(data, null, '\t'));

    // Convert the first page to lines, then to rows, and write one output line per row
    const firstPageLines = PDFExtract.utils.pageToLines(data.pages[0], 2);
    const textRows = PDFExtract.utils.extractTextRows(firstPageLines);
    const plainText = textRows.map((row) => row.join('')).join('\n');
    fs.writeFileSync('./example-output.txt', plainText);

    // Also print the JSON form of the result
    console.log(JSON.stringify(data, null, '\t'));
});

Hope it will work for you

answered Aug 22, 2018 at 9:58

sanjeev kumar

712 silver badges3 bronze badges

1 Comment

sanjeev kumar Over a year ago

require('../lib') points to the lib folder, and PDFExtract is the module defined there. Please have a look at the GitHub link.

Fairouz Amor · Accepted Answer · 2020-01-28 13:40:21Z

Using the pdf-extract npm package (https://www.npmjs.com/package/pdf-extract) allows you to extract text from a PDF.

// Extract text from PDF files (with images)
// Installation guide: https://github.com/nisaacson/pdf-extract

var extract = (function() {

  'use strict';

  var fs = require('fs');
  var path = require('path');
  var pdfExtract = require('pdf-extract');

  var defaultOptions = {
    type: 'ocr',
    ocr_flags: [
      '-l eng',
    ]
  };

  // Execute script if not used as a module
  if (!module.parent) {

    init(process.argv[2]);
  }

  function init(filePath, options, callback) {

    callback = callback || function (error, response) {

      if (error) { return console.error(error); }

      return console.log(response);
    };

    options = options || defaultOptions;

    if (!filePath) {

      return callback(new Error('No input file (PDF) specified.'));
    }

    processFile(filePath, ocrLanguage, callback);
  }

  function processFile(filePath, ocrLanguage, callback) {

    var processor = pdfExtract(filePath, options, function (error) {

      if (error) {

        callback(error);
      }
    });

    processor.on('complete', function (data) {

      saveFile(filePath + '.txt', data.text_pages, callback);
    });

    processor.on('error', function (error) {

      callback(error);
    });
  }

  function saveFile(filePath, string, callback) {

    // Normalize file path
    filePath = path.normalize(filePath);

    try {

      callback('Saved file ' + filePath);

      // Save file
      return fs.writeFileSync(filePath, string, 'utf8');
    } catch (error) {

      callback(error);
    }
  }

  module.exports = {

    init: init
  };
}());

Dario Paez · Accepted Answer · 2021-04-30 19:33:28Z

File readPdf.js

/**
 * Read a PDF through `pdfExtract.extract` and expose the outcome as a Promise.
 *
 * NOTE(review): `pdfExtract` is not declared in this snippet; it must already
 * be in scope (presumably an extractor instance exposing
 * `extract(file, callback)`) — confirm.
 *
 * @param {string} file - PDF to read, passed straight to `pdfExtract.extract`.
 * @param {number} [timeoutMs=10000] - Delay after which the promise rejects
 *   if the extraction has not called back.
 * @returns {Promise<*>} Resolves with the value produced by the extractor;
 *   rejects with an Error when the extraction fails, throws or times out.
 */
const readPdf = (file, timeoutMs = 10000) => new Promise((resolve, reject) => {
  // Start the timer first so every outcome below can cancel it; a pending
  // timer would otherwise keep the process alive after the promise settled
  const timer = setTimeout(() => {
    reject(new Error('Promise timed out after ' + timeoutMs + ' ms'));
  }, timeoutMs);

  try {
    pdfExtract.extract(file, (error, text) => {
      clearTimeout(timer);

      if (error) {
        // Keep the original failure reachable through `cause`
        reject(new Error('El archivo no se pudo leer', { cause: error }));
        return;
      }

      resolve(text);
    });
  } catch (error) {
    // A synchronous failure must reject right away instead of being swallowed
    clearTimeout(timer);
    reject(error);
  }
});
    // Export readPdf as a named member so callers can destructure it from require()
    module.exports = { readPdf };

File xxx.js

// Load the local readPdf.js module (a bare 'readPdf' would be looked up in node_modules)
const { readPdf } = require('./readPdf');

// NOTE(review): `files` is not defined in this snippet; presumably the path of
// the PDF to read — confirm.
readPdf(files)
  .then((response) => {
    console.log(response); // this is your data
    return response;
  })
  .catch((err) => console.log(err));

Collectives™ on Stack Overflow

How to Extract data from pdf file in nodejs

4 Answers 4

3 Comments

1 Comment

Comments

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

4 Answers 4

3 Comments

1 Comment

Comments

Comments

Your Answer

Sign up or log in

Post as a guest

Related