Scan and OCR PDFs in C#

This guide shows how to convert a physical document into a searchable PDF/A-1b document (which is based on PDF 1.4) using the document feeder of a scanner. It uses the TWAIN protocol to communicate with the scanner.

In the code below, you do the following:

  1. Configure the scanner using the TWAIN protocol.
  2. Create a PDF file to store the scanned pages.
  3. Read the image-based pages from the scanner one by one.
  4. Add the scanned pages to the PDF document.
  5. OCR each page.
  6. Close the TWAIN source after scanning the pages and release the resources.
// Acquires pages from the user-selected TWAIN source, appends each one as an
// image-based page to a new PDF/A-1b document, runs OCR on it, and finally
// saves the document to "pdfocr.pdf". A summary of the per-page statuses is
// shown in a message box at the end.
// We assume GdPicture has been correctly installed and unlocked.
int ImageID = 0;
bool bContinue = false;
string message = "Done !\n";
GdPictureImaging oGdPictureImaging = new GdPictureImaging();
GdPicturePDF oGdPicturePDF = new GdPicturePDF();
try
{
    // The source is opened exactly once, as part of this condition; it must not
    // be opened again inside the branch.
    if (oGdPictureImaging.TwainSelectSource(this.Handle) &&
        oGdPictureImaging.TwainOpenDefaultSource(this.Handle))
    {
        oGdPictureImaging.TwainSetAutoFeed(true); //Enabling AutoFeed option.
        oGdPictureImaging.TwainSetAutoScan(true); //Achieving the maximum scanning rate.
        oGdPictureImaging.TwainSetResolution(200);
        oGdPictureImaging.TwainSetPixelType(TwainPixelType.TWPT_BW); //Setting the image to be Black & White.
        oGdPictureImaging.TwainSetBitDepth(1); //1 bpp
        oGdPicturePDF.NewPDF(PdfConformance.PDF_A_1b); //Creating the destination PDF document.
        do
        {
            ImageID = oGdPictureImaging.TwainAcquireToGdPictureImage(this.Handle);
            if (oGdPictureImaging.GetStat() == GdPictureStatus.OK)
            {
                // Creating an image-based page in the destination document.
                if (oGdPicturePDF.AddImageFromGdPictureImage(ImageID, false, false) == GdPictureStatus.OK)
                {
                    // OCR-ing the currently created page, if creation was successful.
                    oGdPicturePDF.OcrPage("eng", "C:\\GdPicture.NET 14\\Redist\\OCR", "", 300);
                }
                // GetStat() reports the status of the last PDF operation executed above
                // (the page creation, or the OCR when the page was created).
                message = message + "Page nr." + oGdPicturePDF.GetCurrentPage().ToString() + " - status: " + oGdPicturePDF.GetStat().ToString() + "\n";
                // Releasing the image.
                oGdPictureImaging.ReleaseGdPictureImage(ImageID);
            }
            if (oGdPictureImaging.TwainGetState() <= TwainStatus.TWAIN_SOURCE_ENABLED)
            {
                // NOTE(review): presumably no further transfer is pending in this state
                // (e.g. the feeder is empty) - confirm against the TWAIN state docs.
                // The user decides whether to scan another batch.
                bContinue = MessageBox.Show("Do you want to acquire other pages?", "", MessageBoxButtons.YesNo, MessageBoxIcon.Question) == DialogResult.Yes;
            }
            else
            {
                // A higher TWAIN state: keep acquiring without prompting.
                bContinue = true;
            }
        } while (bContinue);
        oGdPicturePDF.SaveToFile("pdfocr.pdf", true);
        message = message + "Saving - status: " + oGdPicturePDF.GetStat().ToString();
        oGdPictureImaging.TwainCloseSource();
        MessageBox.Show(message, "TWAIN + OCR Example", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
    else
    {
        MessageBox.Show("Can't open the default source.\nresult code: " + oGdPictureImaging.TwainGetLastResultCode() +
                                   "\ncondition code: " + oGdPictureImaging.TwainGetLastConditionCode(), "TWAIN + OCR Example", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
finally
{
    // Release both GdPicture objects even if an exception interrupts the scan.
    oGdPictureImaging.Dispose();
    oGdPicturePDF.Dispose();
}
'Acquires pages from the user-selected TWAIN source, appends each one as an
'image-based page to a new PDF/A-1b document, runs OCR on it, and finally
'saves the document to "pdfocr.pdf". A summary of the per-page statuses is
'shown in a message box at the end.
'We assume GdPicture has been correctly installed and unlocked.
Dim ImageID As Integer = 0
Dim bContinue As Boolean = False
Dim message As String = "Done !" + vbCrLf
Dim oGdPictureImaging As New GdPictureImaging()
Dim oGdPicturePDF As New GdPicturePDF()
Try
    'The source is opened exactly once, as part of this condition.
    If (oGdPictureImaging.TwainSelectSource(Me.Handle) AndAlso
        oGdPictureImaging.TwainOpenDefaultSource(Me.Handle)) Then
        oGdPictureImaging.TwainSetAutoFeed(True) 'Enabling AutoFeed option.
        oGdPictureImaging.TwainSetAutoScan(True) 'Achieving the maximum scanning rate.
        oGdPictureImaging.TwainSetResolution(200)
        oGdPictureImaging.TwainSetPixelType(TwainPixelType.TWPT_BW) 'Setting the image to be Black & White.
        oGdPictureImaging.TwainSetBitDepth(1) '1 bpp
        'Creating the destination PDF document.
        oGdPicturePDF.NewPDF(PdfConformance.PDF_A_1b)
        Do
            ImageID = oGdPictureImaging.TwainAcquireToGdPictureImage(Me.Handle)
            If oGdPictureImaging.GetStat() = GdPictureStatus.OK Then
                'Creating an image-based page in the destination document.
                If oGdPicturePDF.AddImageFromGdPictureImage(ImageID, False, False) = GdPictureStatus.OK Then
                    'OCR-ing the currently created page, if creation was successful.
                    oGdPicturePDF.OcrPage("eng", "C:\GdPicture.NET 14\Redist\OCR", "", 300)
                End If
                'GetStat() reports the status of the last PDF operation executed above
                '(the page creation, or the OCR when the page was created).
                message = message + "Page nr." + oGdPicturePDF.GetCurrentPage().ToString() + " - status: " + oGdPicturePDF.GetStat().ToString() + vbCrLf
                'Releasing the image.
                oGdPictureImaging.ReleaseGdPictureImage(ImageID)
            End If
            If oGdPictureImaging.TwainGetState() <= TwainStatus.TWAIN_SOURCE_ENABLED Then
                'NOTE(review): presumably no further transfer is pending in this state
                '(e.g. the feeder is empty) - confirm against the TWAIN state docs.
                'The user decides whether to scan another batch.
                bContinue = (MessageBox.Show("Do you want to acquire other pages?", "", MessageBoxButtons.YesNo, MessageBoxIcon.Question) = DialogResult.Yes)
            Else
                'A higher TWAIN state: keep acquiring without prompting.
                bContinue = True
            End If
        Loop While bContinue
        oGdPicturePDF.SaveToFile("pdfocr.pdf", True)
        message = message + "Saving - status: " + oGdPicturePDF.GetStat().ToString()
        oGdPictureImaging.TwainCloseSource()
        MessageBox.Show(message, "TWAIN + OCR Example", MessageBoxButtons.OK, MessageBoxIcon.Information)
    Else
        'VB string literals have no "\n" escape, so vbCrLf is used for line breaks.
        'The codes are converted with ToString() and joined with "&" because "+"
        'between a String and a non-String value is treated as numeric addition.
        MessageBox.Show("Can't open the default source." & vbCrLf & "result code: " & oGdPictureImaging.TwainGetLastResultCode().ToString() &
                                   vbCrLf & "condition code: " & oGdPictureImaging.TwainGetLastConditionCode().ToString(), "TWAIN + OCR Example", MessageBoxButtons.OK, MessageBoxIcon.Error)
    End If
Finally
    'Release both GdPicture objects even if an exception interrupts the scan.
    oGdPictureImaging.Dispose()
    oGdPicturePDF.Dispose()
End Try