7

I have already used 'pdf.js-extract' npm module to fetch data from pdf.

// Pull the extractor class out of the pdf.js-extract package.
const { PDFExtract } = require('pdf.js-extract');

const filename = "/home/aman/Downloads/sample_invoice.pdf";
const pdfExtract = new PDFExtract();

// Log the raw extraction result as a JSON string, or log the error if extraction failed.
pdfExtract.extract(filename, (failure, result) => {
    if (failure) {
        return console.log(failure);
    }
    console.log(JSON.stringify(result));
});

But I am not getting the desired result. I want to extract the relevant information from an invoice PDF, such as the tax, the total amount paid, and the seller address, and save the extracted data into a MongoDB collection.

5
  • 1
    It will always be the same invoice format (positions)? Can you show us the return of console.log(JSON.stringify(data));? Commented Aug 22, 2018 at 8:02
  • Actually, the PDF is generated by marketplaces such as Amazon, Flipkart, etc., so the format can differ. Commented Aug 22, 2018 at 8:11
  • The module will extract the data, but they will have a different format ... So either create a function by type of invoice that will take the data as on the base template, or create a script that will recognize the content ... Commented Aug 22, 2018 at 8:18
  • Send an example of what you are getting console.log(JSON.stringify(data)); Commented Aug 22, 2018 at 8:18
  • the response i am getting is { "pages": [ { "content": [ { "x": 348.41, "y": 125.59899999999993, "str": "Invoice Number", "dir": "ltr", "width": 61.61760000000001, "height": 8.8, "fontName": "g_d0_f2" }, { "x": 451.935, "y": 125.59899999999993, "str": "INV-3337", "dir": "ltr", "width": 37.171200000000006, "height": 8.8, "fontName": "g_d0_f2" }, ], } Commented Aug 22, 2018 at 8:45

4 Answers 4

1

You must write one function per invoice format (fn company1, fn company2...).

Here is an example with three different functions to retrieve data in the export of the pdf.js-extract module:

// Sample invoice: a trimmed extraction result with one page holding two text items
// (the "Invoice Number" label and its value).
const sampleInvoice = {
  pages: [
    {
      content: [
        {
          x: 348.41,
          y: 125.59899999999993,
          str: 'Invoice Number',
          dir: 'ltr',
          width: 61.61760000000001,
          height: 8.8,
          fontName: 'g_d0_f2',
        },
        {
          x: 451.935,
          y: 125.59899999999993,
          str: 'INV-3337',
          dir: 'ltr',
          width: 37.171200000000006,
          height: 8.8,
          fontName: 'g_d0_f2',
        },
      ],
    },
  ],
};


// Run each lookup strategy against the sample invoice and show its result
// in an alert dialog (this demo is meant to run in a browser).
const foundByPosition = searchByPosition(sampleInvoice.pages, 450, 125);
alert(foundByPosition);

const foundAfterLabel = searchByPrev(sampleInvoice.pages, 'Invoice Number');
alert(foundAfterLabel);

const foundByPattern = searchByFormat(sampleInvoice.pages, /INV-\d+$/);
alert(foundByPattern);


/**
 * Return the text of the first item whose position lies within `range`
 * units of the requested (x, y) point on both axes.
 *
 * Pages and items are scanned in document order and the first hit wins.
 * The tolerance is a real maximum difference: an item matches when
 * |item.x - x| <= range and |item.y - y| <= range. (Comparing
 * Math.floor(value / range) buckets instead would miss items that sit
 * close to the target but on the other side of a bucket boundary,
 * e.g. 449 vs 451 with range 10.)
 *
 * @param {Array<{content: Array<{x: number, y: number, str: string}>}>} pages
 *        Pages shaped like `sampleInvoice.pages`.
 * @param {number} x - Target horizontal position.
 * @param {number} y - Target vertical position.
 * @param {number} [range=10] - Maximum allowed difference on each axis.
 * @returns {string} The matching item's `str`, or 'NotFound'.
 */
function searchByPosition(pages, x, y, range = 10)
{
    // Loop in all pages
    for (const page of pages)
    {
        // A page without a content list simply has nothing to match
        const items = (page && page.content) || [];

        // Loop in all content
        for (const item of items)
        {
            const isNearX = Math.abs(item.x - x) <= range;
            const isNearY = Math.abs(item.y - y) <= range;

            if (isNearX && isNearY)
            {
                return item.str;
            }
        }
    }

    // No results found
    return 'NotFound';
}


/**
 * Find the text item that comes right after a given label.
 *
 * The label comparison is case-insensitive and must match the whole item
 * text. Only the item immediately following the label on the same page is
 * considered; a label that is the last item of its page is skipped.
 * (Returning the item at position - 1 instead would give a
 * "searchByNext" variant.)
 *
 * @param {Array<{content: Array<{str: string}>}>} pages - Pages to scan in order.
 * @param {string} txt - Label text to look for.
 * @returns {string} Text of the item following the label, or 'NotFound'.
 */
function searchByPrev(pages, txt)
{
    const wanted = txt.toLowerCase();

    for (const { content } of pages)
    {
        for (const [position, item] of content.entries())
        {
            const follower = content[position + 1];

            if (item.str.toLowerCase() === wanted && follower)
            {
                return follower.str;
            }
        }
    }

    return 'NotFound';
}


/**
 * Return the text of the first item that matches a regular expression.
 *
 * Pages and items are scanned in document order and the first hit wins.
 *
 * @param {Array<{content: Array<{str: string}>}>} pages - Pages to scan.
 * @param {RegExp} regex - Pattern to test each item's text against.
 *        Global (/g) and sticky (/y) regexes are supported: their
 *        `lastIndex` is reset around every test so that state left over
 *        from a previous match cannot hide a matching item.
 * @returns {string} The matching item's `str`, or 'NotFound'.
 */
function searchByFormat(pages, regex)
{
    // Loop in all pages
    for (const page of pages)
    {
        // Loop in all content
        for (const item of page.content)
        {
            // RegExp#test resumes from lastIndex for /g and /y patterns,
            // so always start from the beginning of the string...
            regex.lastIndex = 0;
            const isMatch = regex.test(item.str);

            // ...and do not leave match state behind for the next caller.
            regex.lastIndex = 0;

            if (isMatch)
            {
                return item.str;
            }
        }
    }

    // No results found
    return 'NotFound';
}

TRY HERE : https://jsfiddle.net/dkhqzg6s/

Sign up to request clarification or add additional context in comments.

3 Comments

Thanks a lot, LIberateur. But the response I sent you is just a glimpse of it. I have to store the invoice number, tax, due date, etc. in the MongoDB collection. What should I do for this?
With this approach we have to write a separate function to fetch each invoice's data. Can you suggest another way to do it? Any help will be appreciated. Thanks!
You must use the functions for each desired data and write a function for each type of invoice. I do not see how to do otherwise unless you want to enter the field of deep learning; but you will lose too much time compared to my first solution.
0

Please refer to GitHub Repository of pdf.js-extract npm module https://github.com/ffalt/pdf.js-extract

The file below is located at example/example.js in the GitHub repository linked above:

const fs = require('fs');
const { PDFExtract } = require('../lib');

const pdfExtract = new PDFExtract();

// Extract ./example.pdf, then write two artifacts next to it:
//   example-output.json - the raw extraction result, tab-indented
//   example-output.txt  - a plain-text rendering of the first page
pdfExtract.extract('./example.pdf', {} /* options*/, (err, data) => {
    if (err) {
        return console.log(err);
    }

    fs.writeFileSync('./example-output.json', JSON.stringify(data, null, '\t'));

    // NOTE(review): the meaning of the second argument (2) is not visible here;
    // check PDFExtract.utils.pageToLines in the library before changing it.
    const firstPageLines = PDFExtract.utils.pageToLines(data.pages[0], 2);
    const textRows = PDFExtract.utils.extractTextRows(firstPageLines);

    // Glue the cells of each row together, one row per output line.
    const joinedRows = textRows.map((cells) => cells.join(''));
    const text = joinedRows.join('\n');
    fs.writeFileSync('./example-output.txt', text);

    console.log(JSON.stringify(data, null, '\t'));
});

Hope it will work for you

1 Comment

require('../lib') points to lib folder and PDFExtract is the Module defined there. Please have a look in to gitHub Link
-1

Using the pdf-extract npm package (https://www.npmjs.com/package/pdf-extract) allows you to extract text from a PDF.

// Extract text from PDF files (with images)
// Installation guide: https://github.com/nisaacson/pdf-extract

var extract = (function() {

  'use strict';

  var fs = require('fs');
  var path = require('path');
  var pdfExtract = require('pdf-extract');

  var defaultOptions = {
    type: 'ocr',
    ocr_flags: [
      '-l eng',
    ]
  };

  // Execute script if not used as a module
  if (!module.parent) {

    init(process.argv[2]);
  }

  function init(filePath, options, callback) {

    callback = callback || function (error, response) {

      if (error) { return console.error(error); }

      return console.log(response);
    };

    options = options || defaultOptions;

    if (!filePath) {

      return callback(new Error('No input file (PDF) specified.'));
    }

    processFile(filePath, ocrLanguage, callback);
  }

  function processFile(filePath, ocrLanguage, callback) {

    var processor = pdfExtract(filePath, options, function (error) {

      if (error) {

        callback(error);
      }
    });

    processor.on('complete', function (data) {

      saveFile(filePath + '.txt', data.text_pages, callback);
    });

    processor.on('error', function (error) {

      callback(error);
    });
  }

  function saveFile(filePath, string, callback) {

    // Normalize file path
    filePath = path.normalize(filePath);

    try {

      callback('Saved file ' + filePath);

      // Save file
      return fs.writeFileSync(filePath, string, 'utf8');
    } catch (error) {

      callback(error);
    }
  }

  module.exports = {

    init: init
  };
}());

Comments

-1

File readPdf.js

/**
 * Promise wrapper around `pdfExtract.extract`.
 *
 * NOTE(review): `pdfExtract` is not defined in this snippet; it must be an
 * extractor instance that is in scope in the module where this code lives.
 *
 * @param {string} file - File handed to `pdfExtract.extract`.
 * @returns {Promise<*>} Resolves with the value the extractor passes to its
 *          callback. Rejects with an Error when extraction fails (the original
 *          error is kept on `cause`), when the extractor throws synchronously,
 *          or when no result arrives within 10 seconds.
 */
const readPdf = (file) => new Promise((resolve, reject) => {
      const TIMEOUT_MS = 10000;

      // Start the timer before calling the extractor so that it can be
      // cancelled even if the callback runs synchronously.
      const timer = setTimeout(function () {
        reject(new Error('Promise timed out after ' + TIMEOUT_MS + ' ms'));
      }, TIMEOUT_MS);

      try {
          pdfExtract.extract( file, function (error, text) {
            // A settled promise no longer needs the timer; clearing it also
            // lets the process exit without waiting for the timeout.
            clearTimeout(timer);

            if (error) {
              const failure = new Error('El archivo no se pudo leer');
              failure.cause = error;
              reject(failure);
              return;
            }

            resolve(text);
          });
      } catch (error) {
        // A synchronous failure must reject; otherwise the promise would
        // stay pending forever.
        clearTimeout(timer);
        reject(error);
      }
    });
    module.exports = { readPdf };

File xxx.js

// Load the helper from the local readPdf.js file (a bare 'readPdf' specifier
// would be looked up in node_modules instead).
const { readPdf } = require('./readPdf');

  // NOTE(review): `files` is not defined in this snippet; it must hold the
  // path of the PDF to read.
  readPdf(files)
      .then(response => {
        console.log(response); // this is the extracted data
        return response;
      })
      .catch(err => console.log(err));

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.