diff --git a/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md b/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md index 2156093d70..59849d4a32 100644 --- a/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md +++ b/Document-Processing/Data-Extraction/NET/conversions/pdf-to-markdown.md @@ -169,6 +169,104 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess {% endtabs %} +## Customize image saving + +The [ImageNodeVisited](https://help.syncfusion.com/cr/document-processing/Syncfusion.DocIO.DLS.SaveOptions.html#Syncfusion_DocIO_DLS_SaveOptions_ImageNodeVisited) event in the Syncfusion® Smart Data Extractor allows users to customize how images are saved during data extraction. With this event, you can: + +* Customize image names and storage paths, and save images externally. +* Replace Base64 content with a file path for optimized storage. + +### Extract Markdown with external image saving + +The following code shows how to use the [ExtractDataAsMarkdown](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html#Syncfusion_SmartDataExtractor_DataExtractor_ExtractDataAsMarkdown_System_IO_Stream_) method of the [DataExtractor](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html) class with the [ImageNodeVisited](https://help.syncfusion.com/cr/document-processing/Syncfusion.DocIO.DLS.SaveOptions.html#Syncfusion_DocIO_DLS_SaveOptions_ImageNodeVisited) event to customize image saving while exporting content as Markdown. + +{% tabs %} + +{% highlight c# tabtitle="C# [Cross-platform]" %} + +using Syncfusion.Office.Markdown; +using Syncfusion.SmartDataExtractor; + +//Open the input PDF or Image file as a stream. +using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + //Initialize the Data Extractor. 
+ DataExtractor extractor = new DataExtractor(); + //Hook the event to customize image handling. + extractor.SaveOptions.ImageNodeVisited += SaveImage; + //Extract Markdown content as string. + string data = extractor.ExtractDataAsMarkdown(inputStream); + //Save the extracted Markdown data into an output file. + File.WriteAllText("DataToMarkdown.md", data); +} + +{% endhighlight %} + +{% highlight c# tabtitle="C# [Windows-specific]" %} + +using Syncfusion.Office.Markdown; +using Syncfusion.SmartDataExtractor; + +//Open the input PDF or Image file as a stream. +using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + //Initialize the Data Extractor. + DataExtractor extractor = new DataExtractor(); + //Hook the event to customize image handling. + extractor.SaveOptions.ImageNodeVisited += SaveImage; + //Extract Markdown content as string. + string data = extractor.ExtractDataAsMarkdown(inputStream); + //Save the extracted Markdown data into an output file. + File.WriteAllText("DataToMarkdown.md", data); +} + +{% endhighlight %} + +{% endtabs %} + +The following code shows how to implement the event handler to customize the image path and save images externally. 
+ +{% tabs %} + +{% highlight c# tabtitle="C# [Cross-platform]" %} + +//Event handler to save images externally +static void SaveImage(object sender, MdImageNodeVisitedEventArgs args) +{ + //Define output image path (customize naming logic as needed) + string imagePath = @"D:\Temp\Image1.png"; + //Save the image stream to file + using (FileStream fileStreamOutput = File.Create(imagePath)) + { + args.ImageStream.CopyTo(fileStreamOutput); + } + //Set the URI to be used in the Markdown output + args.Uri = imagePath; +} + +{% endhighlight %} + +{% highlight c# tabtitle="C# [Windows-specific]" %} + +//Event handler to save images externally +static void SaveImage(object sender, MdImageNodeVisitedEventArgs args) +{ + //Define output image path (customize naming logic as needed) + string imagePath = @"D:\Temp\Image1.png"; + //Save the image stream to file + using (FileStream fileStreamOutput = File.Create(imagePath)) + { + args.ImageStream.CopyTo(fileStreamOutput); + } + //Set the URI to be used in the Markdown output + args.Uri = imagePath; +} + +{% endhighlight %} + +{% endtabs %} + + ## PDF to Markdown Preservation Mapping This section explains how common PDF elements are converted and preserved in Markdown format, ensuring that document structure and formatting remain consistent during the PDF to Markdown conversion process. diff --git a/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md b/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md index ccf5c36ed3..4379b8cecc 100644 --- a/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md +++ b/Document-Processing/Data-Extraction/NET/working-with-data-extraction.md @@ -320,6 +320,107 @@ using (FileStream inputStream = new FileStream("Input.pdf", FileMode.Open, FileA You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-as-stream/.NET). 
+## Convert PDF/Image to Markdown + +The **Smart Data Extractor** enables you to process PDF documents or scanned images and export the structured content as a MarkdownDocument (MD DOM). + +This section covers two scenarios: + +* Extracting from PDF +* Extracting from Image + +### Extracting from PDF + +To extract structured data from a PDF document and save it as a Markdown document using the **ExtractDataAsMarkdownDocument** method of the [DataExtractor](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html) class, refer to the following code example: + +{% tabs %} + +{% highlight c# tabtitle="C# [Cross-platform]" %} + +using System.IO; +using Syncfusion.SmartDataExtractor; +using Syncfusion.Office.Markdown; + +// Open the input PDF file as a stream. +using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + // Initialize the Data Extractor. + DataExtractor extractor = new DataExtractor(); + // Extract data as MarkdownDocument. + MarkdownDocument markdownDocument = extractor.ExtractDataAsMarkdownDocument(stream); + // Save the extracted Markdown data into an output file. + markdownDocument.Save("Output.md"); +} + +{% endhighlight %} + +{% highlight c# tabtitle="C# [Windows-specific]" %} + +using System.IO; +using Syncfusion.SmartDataExtractor; +using Syncfusion.Office.Markdown; + +// Open the input PDF file as a stream. +using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + // Initialize the Data Extractor. + DataExtractor extractor = new DataExtractor(); + // Extract data as MarkdownDocument. + MarkdownDocument markdownDocument = extractor.ExtractDataAsMarkdownDocument(stream); + // Save the extracted Markdown data into an output file. 
+ markdownDocument.Save("Output.md"); +} + +{% endhighlight %} + +{% endtabs %} + +### Extracting from Image + +To extract structured data from an image file and save it as a Markdown document using the **ExtractDataAsMarkdownDocument** method of the [DataExtractor](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html) class, refer to the following C# code example. + +{% tabs %} + +{% highlight c# tabtitle="C# [Cross-platform]" %} + +using System.IO; +using Syncfusion.SmartDataExtractor; +using Syncfusion.Office.Markdown; + +// Open the input image file as a stream. +using (FileStream stream = new FileStream("Input.png", FileMode.Open, FileAccess.Read)) +{ + // Initialize the Data Extractor. + DataExtractor extractor = new DataExtractor(); + // Extract data as MarkdownDocument. + MarkdownDocument markdownDocument = extractor.ExtractDataAsMarkdownDocument(stream); + // Save the extracted Markdown data into an output file. + markdownDocument.Save("Output.md"); +} + +{% endhighlight %} + +{% highlight c# tabtitle="C# [Windows-specific]" %} + +using System.IO; +using Syncfusion.SmartDataExtractor; +using Syncfusion.Office.Markdown; + +// Open the input image file as a stream. +using (FileStream stream = new FileStream("Input.png", FileMode.Open, FileAccess.Read)) +{ + // Initialize the Data Extractor. + DataExtractor extractor = new DataExtractor(); + // Extract data as MarkdownDocument. + MarkdownDocument markdownDocument = extractor.ExtractDataAsMarkdownDocument(stream); + // Save the extracted Markdown data into an output file. 
+ markdownDocument.Save("Output.md"); +} + +{% endhighlight %} + +{% endtabs %} + + ## Disable Form Detection To disable form field detection while extracting structured data from a PDF document using the [ExtractDataAsJson](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html#Syncfusion_SmartDataExtractor_DataExtractor_ExtractDataAsJson_System_IO_Stream_) method of the [DataExtractor](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html) class, refer to the following code example: @@ -340,14 +441,11 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //By default - true extractor.EnableFormDetection = false; //Extract form data and return it as a JSON string. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - //Save the extracted output as a new json file. - pdf.Save("Output.json"); - //Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } - {% endhighlight %} {% highlight c# tabtitle="C# [Windows-specific]" %} @@ -364,11 +462,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //By default - true extractor.EnableFormDetection = false; //Extract form data and return it as a JSON string. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - //Save the extracted output as a new json file. - pdf.Save("Output.json"); - //Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. 
+ File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -397,14 +493,11 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //By default - true extractor.EnableTableDetection = false; // Extract data and return it as a JSON string. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - // Save the extracted output as a new json file. - pdf.Save("Output.json"); - // Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } - {% endhighlight %} {% highlight c# tabtitle="C# [Windows-specific]" %} @@ -421,11 +514,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //By default - true extractor.EnableTableDetection = false; // Extract data and return it as a JSON string. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - // Save the extracted output as a new json file. - pdf.Save("Output.json"); - // Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -470,11 +561,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //Assign the configured form recognition options to the extractor. extractor.FormRecognizeOptions = formOptions; //Extract form data and return it as a JSON string. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - //Save the extracted output as a new json file. - pdf.Save("Output.json"); - //Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. 
+ File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -509,11 +598,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //Assign the configured form recognition options to the extractor. extractor.FormRecognizeOptions = formOptions; //Extract form data and return it as a JSON string. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - //Save the extracted output as a new json file. - pdf.Save("Output.json"); - //Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -552,11 +639,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess // Assign the table extraction options to the extractor. extractor.TableExtractionOptions = tableOptions; // Extract data and return it as a JSON string. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - // Save the extracted output as a new json file. - pdf.Save("Output.json"); - // Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -585,11 +670,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess // Assign the table extraction options to the extractor. extractor.TableExtractionOptions = tableOptions; // Extract data and return it as a JSON string. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - // Save the extracted output as a new json file. - pdf.Save("Output.json"); - // Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. 
+ File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -619,11 +702,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //default confidence threshold value is 0.6 extractor.ConfidenceThreshold = 0.75; // Extract data and return it as a JSON string. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - // Save the extracted output as a new json file. - pdf.Save("Output.json"); - // Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -643,11 +724,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //default confidence threshold value is 0.6 extractor.ConfidenceThreshold = 0.75; // Extract data and return it as a JSON string. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - // Save the extracted output as a new json file. - pdf.Save("Output.json"); - // Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -675,11 +754,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //Set the page range for extraction (pages 1 to 3). extractor.PageRange = new int[,] { { 1, 3 } }; //Extract data and return it as a JSON string. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - //Save the extracted output as a new PDF file. - pdf.Save("Output.json"); - //Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. 
+ File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -697,11 +774,9 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess //Set the page range for extraction (pages 1 to 3). extractor.PageRange = new int[,] { { 1, 3 } }; //Extract data and return it as a JSON string. - PdfLoadedDocument pdf = extractor.ExtractDataAsJson(stream); - //Save the extracted output as a new json file. - pdf.Save("Output.json"); - //Close the document. - pdf.Close(true); + string data = extractor.ExtractDataAsJson(stream); + //Save the extracted JSON data into an output file. + File.WriteAllText("Output.json", data, Encoding.UTF8); } {% endhighlight %} @@ -711,5 +786,72 @@ using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess You can download a complete working sample from [GitHub](https://github.com/SyncfusionExamples/PDF-Examples/tree/master/Data-Extraction/Smart-Data-Extractor/Extract-data-within-specific-range/.NET). +## Configure OCR Processing Settings + +To configure OCR settings in .NET using the **OCRProcessor** property of the [DataExtractor](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html) class, use the following C# example to initialize the OCR processor, set language and Tesseract version, and extract structured data from a PDF document with the [ExtractDataAsPdfDocument](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartDataExtractor.DataExtractor.html#Syncfusion_SmartDataExtractor_DataExtractor_ExtractDataAsPdfDocument_System_IO_Stream_) method. + + +{% tabs %} + +{% highlight c# tabtitle="C# [Cross-platform]" %} + +using Syncfusion.Pdf.Parsing; +using Syncfusion.OCRProcessor; +using Syncfusion.SmartDataExtractor; + +//Open the input PDF file as a stream. +using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + //Initialize the Data Extractor. 
+ DataExtractor extractor = new DataExtractor(); + //Initialize the OCR processor. + OCRProcessor processor = new OCRProcessor(); + //Set OCR language. + processor.Settings.Language = Languages.English; + //Set Tesseract OCR engine version. + processor.Settings.TesseractVersion = TesseractVersion.Version5_0; + //Assign the configured OCR processor to the Data Extractor. + extractor.OCRProcessor = processor; + //Extract data and return as a loaded PDF document. + PdfLoadedDocument pdf = extractor.ExtractDataAsPdfDocument(stream); + //Save the extracted output as a new PDF file. + pdf.Save("Output.pdf"); + //Close the document. + pdf.Close(true); +} + +{% endhighlight %} + +{% highlight c# tabtitle="C# [Windows-specific]" %} + +using Syncfusion.Pdf.Parsing; +using Syncfusion.OCRProcessor; +using Syncfusion.SmartDataExtractor; + +//Open the input PDF file as a stream. +using (FileStream stream = new FileStream("Input.pdf", FileMode.Open, FileAccess.Read)) +{ + //Initialize the Data Extractor. + DataExtractor extractor = new DataExtractor(); + //Initialize the OCR processor. + OCRProcessor processor = new OCRProcessor(); + //Set OCR language. + processor.Settings.Language = Languages.English; + //Set Tesseract OCR engine version. + processor.Settings.TesseractVersion = TesseractVersion.Version5_0; + //Assign the configured OCR processor to the Data Extractor. + extractor.OCRProcessor = processor; + //Extract data and return as a loaded PDF document. + PdfLoadedDocument pdf = extractor.ExtractDataAsPdfDocument(stream); + //Save the extracted output as a new PDF file. + pdf.Save("Output.pdf"); + //Close the document. 
+ pdf.Close(true); +} + +{% endhighlight %} + +{% endtabs %} + diff --git a/Document-Processing/Data-Extraction/NET/working-with-form-recognition.md b/Document-Processing/Data-Extraction/NET/working-with-form-recognition.md index 4e76f53b22..a68a71e331 100644 --- a/Document-Processing/Data-Extraction/NET/working-with-form-recognition.md +++ b/Document-Processing/Data-Extraction/NET/working-with-form-recognition.md @@ -8,6 +8,10 @@ documentation: UG # Working with Form Recognition +The Syncfusion® Smart Form Recognizer is a C# library for .NET that reliably extracts form data from PDFs and scanned images. It detects text fields, checkboxes, radio buttons, and signature regions. +To quickly get started with recognizing form data from PDF and image files using the Smart Form Recognizer library, refer to this video tutorial: +{% youtube "https://www.youtube.com/watch?v=1F1jRW3JIB4" %} + ## Recognize Forms as JSON To recognize form data from a PDF or image and get the output as a JSON string using the [RecognizeFormAsJson](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartFormRecognizer.FormRecognizer.html#Syncfusion_SmartFormRecognizer_FormRecognizer_RecognizeFormAsJson_System_IO_Stream_) (synchronous) and [RecognizeFormAsJsonAsync](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartFormRecognizer.FormRecognizer.html#Syncfusion_SmartFormRecognizer_FormRecognizer_RecognizeFormAsJsonAsync_System_IO_Stream_System_Threading_CancellationToken_) (asynchronous) methods of the [FormRecognizer](https://help.syncfusion.com/cr/document-processing/Syncfusion.SmartFormRecognizer.FormRecognizer.html) class, refer to the following code examples.