NodeJS & Tesseract.js for OCR [Example]

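A short example of iterating over a directory of scanned documents (JPGs) and performing Optical Character Recognition (OCR) on each one. Note that the glob pattern used below only matches files directly inside the directory; it does not descend into sub-directories.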

Start by installing Tesseract.js to do the OCR, glob to read the directory for filenames and fs-extra to write the txt files containing our cleaned text.

npm install tesseract.js glob fs-extra --save

Next, create the script ocr.js, which uses these npm packages:

// load our NPM packages
var Tesseract = require('tesseract.js');
var glob = require("glob")
var fs = require('fs-extra');
// define where the scanned docs can be found
var path = 'public/resized/';

// Read the JPG filenames directly inside `path` (the pattern does not descend
// into sub-directories), OCR each one and write the cleaned text next to the
// image as "<filename>.txt".
// NOTE(review): the `.progress()` chaining below is the job-style Tesseract.js
// API this script was written against; confirm the installed version still
// exposes it.
glob(path + "*.jpg", function (er, files) {
  // glob reports failures through the first callback argument; without this
  // guard a failed lookup would crash on `files.forEach`
  if (er) {
    console.error(er);
    return;
  }

  // for each filename start a Tesseract recognition job
  files.forEach(function (filename) {
    // set the language to French. Omitting the second argument defaults to eng
    Tesseract.recognize(filename, 'fra')
      .progress(function (p) { console.log(filename, p); })
      .then(function (result) {
        var rawtxt = result.text;
        // replace every run of characters other than ASCII letters, digits,
        // '.', '/' and '-' with a single space
        // NOTE(review): this also strips accented letters, which occur in
        // French text — confirm that is intended
        var clntxt = rawtxt.replace(/[^a-z0-9./-]+/gi, ' ');
        // writeFileSync is synchronous and takes no completion callback, so
        // log right after it returns (it throws on failure, which the
        // trailing catch reports)
        fs.writeFileSync(filename + '.txt', clntxt);
        console.log('WROTE ' + filename + '.txt');
      })
      // attached last so it reports both recognition failures and errors
      // thrown while cleaning or writing the text
      .catch(function (err) { console.error(err); });
  });
});

Run the script with

node ocr.js

It’s not fast but it works!

Advertisements