using IronPdf;

// Disable local disk access or cross-origin requests
Installation.EnableWebSecurity = true;

// Instantiate Renderer
var renderer = new ChromePdfRenderer();

// Create a PDF from a HTML string using C#
var pdf = renderer.RenderHtmlAsPdf("<h1>Hello World</h1>");

// Export to a file or Stream
pdf.SaveAs("output.pdf");

// Advanced Example with HTML Assets
// Load external html assets: Images, CSS and JavaScript.
// An optional BasePath 'C:\site\assets\' is set as the file location to load assets from
var myAdvancedPdf = renderer.RenderHtmlAsPdf("<img src='icons/iron.png'>", @"C:\site\assets\");
myAdvancedPdf.SaveAs("html-with-assets.pdf");

using IronPdf;
using System;

// Step 1. Creating a PDF with editable forms from HTML using form and input tags
// Radio Button and Checkbox can also be implemented with input type 'radio' and 'checkbox'
// NOTE(review): 'Stoke' below looks like a typo for 'Stroke'; it is left unchanged because
// it is also the form field name and value — confirm before renaming.
const string formHtml = @"
<html>
    <body>
        <h2>Editable PDF Form</h2>
        <form>
            First name: <br> <input type='text' name='firstname' value=''> <br>
            Last name: <br> <input type='text' name='lastname' value=''> <br>
            <br>
            <p>Please specify your gender:</p>
            <input type='radio' id='female' name='gender' value= 'Female'>
            <label for='female'>Female</label> <br>
            <br>
            <input type='radio' id='male' name='gender' value='Male'>
            <label for='male'>Male</label> <br>
            <br>
            <input type='radio' id='non-binary/other' name='gender' value='Non-Binary / Other'>
            <label for='non-binary/other'>Non-Binary / Other</label>
            <br>
            <p>Please select all medical conditions that apply:</p>
            <input type='checkbox' id='condition1' name='Hypertension' value='Hypertension'>
            <label for='condition1'> Hypertension</label><br>
            <input type='checkbox' id='condition2' name='Heart Disease' value='Heart Disease'>
            <label for='condition2'> Heart Disease</label><br>
            <input type='checkbox' id='condition3' name='Stoke' value='Stoke'>
            <label for='condition3'> Stoke</label><br>
            <input type='checkbox' id='condition4' name='Diabetes' value='Diabetes'>
            <label for='condition4'> Diabetes</label><br>
            <input type='checkbox' id='condition5' name='Kidney Disease' value='Kidney Disease'>
            <label for='condition5'> Kidney Disease</label><br>
        </form>
    </body>
</html>";

// Instantiate Renderer
var renderer = new ChromePdfRenderer();
renderer.RenderingOptions.CreatePdfFormsFromHtml = true;
renderer.RenderHtmlAsPdf(formHtml).SaveAs("BasicForm.pdf");

// Step 2. Reading and Writing PDF form values.
var formDocument = PdfDocument.FromFile("BasicForm.pdf");

// Set and Read the value of the "firstname" field
var firstNameField = formDocument.Form.FindFormField("firstname");
firstNameField.Value = "Minnie";
Console.WriteLine("FirstNameField value: {0}", firstNameField.Value);

// Set and Read the value of the "lastname" field
var lastNameField = formDocument.Form.FindFormField("lastname");
lastNameField.Value = "Mouse";
Console.WriteLine("LastNameField value: {0}", lastNameField.Value);

formDocument.SaveAs("FilledForm.pdf");

使用IRONPDF

如何在C#中读取PDF表格

更新于 2024年3月3日

用 C# 从 PDF 文件中提取数据是一项相当大的挑战。数据可以是文本、图像、图表、图形、表格等形式。有时，业务分析人员需要提取数据以进行数据分析，并根据分析结果做出决策。IronPDF C# PDF 库是从 PDF 文件中提取数据的绝佳解决方案。

本文将演示如何使用 IronPDF 库用 C# 从 PDF 文档中提取表格数据。

如何在C#中读取PDF表格

安装 C# 库读取 PDF 表格
利用 RenderHtmlAsPdf 方法从 HTML 字符串渲染 PDF 文档
使用 ExtractAllText 方法从 PDF 中读取表格数据
输出并查看提取的数据
使用 C# 将提取的数据保存为 CSV 文件格式

IronPDF - C# PDF Library

IronPDF 是一个 C# .NET 库，可帮助开发人员在其软件应用程序中轻松读取、创建和编辑 PDF 文档。它的 Chromium 引擎能准确、快速地渲染 PDF 文档。它允许开发人员将不同格式无缝转换为 PDF，反之亦然。它支持最新的 .NET 7，以及 .NET 6、.NET 5、.NET Framework 4、.NET Core 和 .NET Standard。

此外，IronPDF .NET API 还能让开发人员轻松操作和编辑 PDF、添加页眉和页脚，以及从 PDF 中提取文本、图像和表格。

一些重要功能包括

加载和创建 PDF 文件 (HTML 转 PDF, 图像到 PDF)
保存和打印 PDF 文件
合并和分割 PDF 文件
提取数据 (文本、图片、表格) 从 PDF 文件

使用 IronPDF 库在 C# 中提取表格数据的步骤

要从 PDF 文档中提取表格数据，我们需要在本地计算机系统中安装以下组件：

Visual Studio - Visual Studio 2022 是用于 C# 开发的官方集成开发环境，必须安装在计算机上。请从 Visual Studio 网站下载。
创建项目 - 创建一个用于提取数据的控制台应用程序。请按照以下步骤创建项目：
- 打开 Visual Studio 2022，然后点击创建新项目按钮
  Visual Studio 的启动屏幕
- 下一步，选择 C# 控制台应用程序并点击下一步
  在 Visual Studio 中创建新的控制台应用程序
- 接下来，键入项目名称"ReadPDFTable"，然后点击下一步
  配置新创建的应用程序
- 为您的项目选择".NET 6.0（长期支持）"。
  选择一个 .NET 框架
- 单击创建按钮，控制台项目将被创建。现在，我们可以通过编程从 PDF 文档中提取表格数据了。
安装 IronPDF - 有 3 种不同的方法来安装 IronPDF 库。具体如下
- 使用 Visual Studio。Visual Studio 包含 NuGet 包管理器，可帮助在 C# 应用程序中安装所有 NuGet 包。
  - 单击顶部菜单中的 "工具"，或
  - 右键单击解决方案资源管理器中的项目
    工具和管理 NuGet 软件包
  - 打开 NuGet 软件包管理器后，浏览 IronPDF 并点击安装，如下图所示：
    工具和管理 NuGet 软件包
- 直接下载 NuGet 包。另一种下载和安装 IronPDF 的简单方法是访问其 NuGet 网站页面。
- 下载 IronPDF .DLL 库。IronPDF 也可以从 IronPDF 网站下载。点击"IronPDF DLL 下载"链接进行下载并安装。请记住，您必须在项目中引用 .DLL 才能使用它。

创建带表格数据的 PDF 文档

在创建任何内容之前，需要将 IronPDF 命名空间添加到文件中，并设置许可证密钥以使用 IronPDF 库中的 ExtractText 方法。

using IronPdf;

// Assign the IronPDF license key; the string below is a placeholder to be
// replaced with a real trial or purchased key.
License.LicenseKey = "YOUR-TRIAL/PURCHASED-LICENSE-KEY";

using IronPdf;

// Assign the IronPDF license key; the string below is a placeholder to be
// replaced with a real trial or purchased key.
License.LicenseKey = "YOUR-TRIAL/PURCHASED-LICENSE-KEY";

Imports IronPdf

' Assign the IronPDF license key; the string below is a placeholder to be
' replaced with a real trial or purchased key.
License.LicenseKey = "YOUR-TRIAL/PURCHASED-LICENSE-KEY"

VB C#

在这里，将从包含表格的 HTML 字符串创建 PDF 文档，然后使用 IronPDF 提取数据。HTML 保存在一个字符串变量中，代码如下：

// Sample HTML document containing a bordered three-column table; it is the
// input that gets rendered to PDF. The heading is closed with </h1> so the
// markup is well-formed (it was previously closed with a mismatched </h2>).
string HTML = "<html>" +
        "<style>" +
            "table, th, td {" +
                "border:1px solid black;" +
            "}" +
        "</style>" +
        "<body>" +
            "<h1>A Simple table example</h1>" +
            "<table>" +
                "<tr>" +
                    "<th>Company</th>" +
                    "<th>Contact</th>" +
                    "<th>Country</th>" +
                "</tr>" +
                "<tr>" +
                    "<td>Alfreds Futterkiste</td>" +
                    "<td>Maria Anders</td>" +
                    "<td>Germany</td>" +
                "</tr>" +
                "<tr>" +
                    "<td>Centro comercial Moctezuma</td>" +
                    "<td>Francisco Chang</td>" +
                    "<td>Mexico</td>" +
                "</tr>" +
            "</table>" +
            "<p>To understand the example better, we have added borders to the table.</p>" +
        "</body>" +
     "</html>";

// Sample HTML document containing a bordered three-column table; it is the
// input that gets rendered to PDF. The heading is closed with </h1> so the
// markup is well-formed (it was previously closed with a mismatched </h2>).
string HTML = "<html>" +
        "<style>" +
            "table, th, td {" +
                "border:1px solid black;" +
            "}" +
        "</style>" +
        "<body>" +
            "<h1>A Simple table example</h1>" +
            "<table>" +
                "<tr>" +
                    "<th>Company</th>" +
                    "<th>Contact</th>" +
                    "<th>Country</th>" +
                "</tr>" +
                "<tr>" +
                    "<td>Alfreds Futterkiste</td>" +
                    "<td>Maria Anders</td>" +
                    "<td>Germany</td>" +
                "</tr>" +
                "<tr>" +
                    "<td>Centro comercial Moctezuma</td>" +
                    "<td>Francisco Chang</td>" +
                    "<td>Mexico</td>" +
                "</tr>" +
            "</table>" +
            "<p>To understand the example better, we have added borders to the table.</p>" +
        "</body>" +
     "</html>";

' Sample HTML document containing a bordered three-column table; it is the
' input that gets rendered to PDF. The heading is closed with </h1> so the
' markup is well-formed (it was previously closed with a mismatched </h2>).
Dim HTML As String = "<html>" &
		"<style>" &
			"table, th, td {" &
				"border:1px solid black;" &
			"}" &
		"</style>" &
		"<body>" &
			"<h1>A Simple table example</h1>" &
			"<table>" &
				"<tr>" &
					"<th>Company</th>" &
					"<th>Contact</th>" &
					"<th>Country</th>" &
				"</tr>" &
				"<tr>" &
					"<td>Alfreds Futterkiste</td>" &
					"<td>Maria Anders</td>" &
					"<td>Germany</td>" &
				"</tr>" &
				"<tr>" &
					"<td>Centro comercial Moctezuma</td>" &
					"<td>Francisco Chang</td>" &
					"<td>Mexico</td>" &
				"</tr>" &
			"</table>" &
			"<p>To understand the example better, we have added borders to the table.</p>" &
		"</body>" &
	"</html>"

VB C#

接下来，使用 ChromePdfRenderer 从 HTML 字符串创建 PDF。代码如下：

// Render the HTML string to a PDF document and save it as table_example.pdf.
var renderer = new ChromePdfRenderer();
renderer.RenderHtmlAsPdf(HTML).SaveAs("table_example.pdf");

// Render the HTML string to a PDF document and save it as table_example.pdf.
var renderer = new ChromePdfRenderer();
renderer.RenderHtmlAsPdf(HTML).SaveAs("table_example.pdf");

' Render the HTML string to a PDF document and save it as table_example.pdf.
Dim renderer = New ChromePdfRenderer()
renderer.RenderHtmlAsPdf(HTML).SaveAs("table_example.pdf")

VB C#

SaveAs 方法会将 PdfDocument 对象保存为 PDF 文件，文件名为 "table_example.pdf"。保存的文件如下所示：

如何在 C# 中读取 PDF 表，图 7：在 NuGet 软件包管理器用户界面中搜索 IronPDF

在 NuGet 软件包管理器用户界面中搜索 IronPDF

使用 IronPDF 从 PDF 文档中提取表格数据

要从 PDF 表格中提取数据，请使用 PdfDocument 对象打开文档，然后使用 ExtractAllText 方法检索数据，以便进一步分析。下面的代码演示了如何完成这一任务：

// Open the saved PDF and read all of its text into a single string.
string text = new PdfDocument("table_example.pdf").ExtractAllText();

// Open the saved PDF and read all of its text into a single string.
string text = new PdfDocument("table_example.pdf").ExtractAllText();

' Open the saved PDF and read all of its text into a single string.
Dim text As String = New PdfDocument("table_example.pdf").ExtractAllText()

VB C#

上述代码使用 ExtractAllText 方法分析整个 PDF 文档，并将提取的数据（包括表格数据）以字符串变量的形式返回。该变量的值可以显示或存储在文件中供以后使用。下面的代码会将其显示在屏幕上：

// Show the extracted text on the console, below a heading line.
Console.WriteLine($"The extracted Text is:\n{text}");

// Show the extracted text on the console, below a heading line.
Console.WriteLine($"The extracted Text is:\n{text}");

Imports Microsoft.VisualBasic

' Show the extracted text on the console, below a heading line.
Console.WriteLine($"The extracted Text is:{vbLf}{text}")

VB C#

如何在 C# 中读取 PDF 表，图 8：提取 PDF 文件中的文本

提取 PDF 文件中的文本

从提取的文本内容中提取表格数据

C# 提供了一个 String.Split 方法，可帮助根据分隔符分割字符串。以下代码将帮助您将输出限制为表格数据。

// Print each line of the extracted text that contains no '.' character;
// lines containing a '.' are skipped.
foreach (string entry in text.Split("\n"))
{
    if (!entry.Contains("."))
    {
        Console.WriteLine(entry);
    }
}

// Print each line of the extracted text that contains no '.' character;
// lines containing a '.' are skipped.
foreach (string entry in text.Split("\n"))
{
    if (!entry.Contains("."))
    {
        Console.WriteLine(entry);
    }
}

Imports Microsoft.VisualBasic

' Print each line of the extracted text that contains no "." character;
' lines containing a "." are skipped.
For Each entry As String In text.Split(vbLf)
	If Not entry.Contains(".") Then
		Console.WriteLine(entry)
	End If
Next entry

VB C#

这个简单的代码示例有助于从提取的文本中只提取表格单元格数据。首先，文本按行分割并保存在字符串数组中。然后，迭代每个数组元素，跳过包含句号"."的元素。在大多数情况下，这样只会从提取的数据中检索到表格数据，但也可能检索到其他行。输出结果如下：

如何在 C# 中读取 PDF 表格，图 9：控制台显示提取的文本

控制台显示提取的文本

从上面的截图中可以看出，表格数据格式和逻辑结构在 Console.WriteLine 方法输出中得到了保留。有关如何使用 IronPDF 从 PDF 文档中提取数据的更多详情，请参阅相关代码示例。

输出结果还可保存为 CSV 文件，稍后可对其进行格式化和编辑，以便进行更多数据分析。代码如下

// Write the filtered lines of the extracted text to table_example.csv,
// overwriting any existing file (append: false). Lines containing a '.' are skipped.
using (StreamWriter file = new StreamWriter("table_example.csv", false))
{
    foreach (string rawLine in text.Split("\n"))
    {
        // If the text uses CRLF line endings, splitting on "\n" leaves a trailing
        // '\r' on every line; strip it so WriteLine does not emit "\r\r\n".
        string textItem = rawLine.TrimEnd('\r');
        if (!textItem.Contains("."))
        {
            file.WriteLine(textItem);
        }
    }
}

// Write the filtered lines of the extracted text to table_example.csv,
// overwriting any existing file (append: false). Lines containing a '.' are skipped.
using (StreamWriter file = new StreamWriter("table_example.csv", false))
{
    foreach (string rawLine in text.Split("\n"))
    {
        // If the text uses CRLF line endings, splitting on "\n" leaves a trailing
        // '\r' on every line; strip it so WriteLine does not emit "\r\r\n".
        string textItem = rawLine.TrimEnd('\r');
        if (!textItem.Contains("."))
        {
            file.WriteLine(textItem);
        }
    }
}

Imports Microsoft.VisualBasic

' Write the filtered lines of the extracted text to table_example.csv,
' overwriting any existing file (append: False). Lines containing a "." are skipped.
Using file As New StreamWriter("table_example.csv", False)
	For Each rawLine As String In text.Split(vbLf)
		' If the text uses CRLF line endings, splitting on vbLf leaves a trailing
		' carriage return on every line; strip it so WriteLine does not emit it twice.
		Dim textItem As String = rawLine.TrimEnd(ControlChars.Cr)
		If Not textItem.Contains(".") Then
			file.WriteLine(textItem)
		End If
	Next rawLine
End Using

VB C#

输出结果将保存为 CSV 文件，其中每个 textItem 占一行。

摘要

本文演示了如何使用 IronPDF 从 PDF 文档中提取数据和表格。IronPDF 为从 PDF 文件中提取文本提供了几个有用的选项。它提供了 ExtractTextFromPage 方法，该方法允许从特定页面提取数据。IronPDF 还可以将不同格式的文件（例如 Markdown 文件或 DOCX 文件）转换为 PDF，也可以将 PDF 转换为不同格式。这使得开发人员可以轻松地将 PDF 功能集成到应用程序开发过程中。此外，它不需要 Adobe Acrobat Reader 就能查看和编辑 PDF 文档。

IronPDF 可免费用于开发，也可授权用于商业用途。它提供了一个免费试用许可证来测试该库的全部功能。更多详细信息，请访问此链接。

< 前一页
如何将二维码转换为PDF

下一步 >
PDF查看器C# Windows应用程序（教程）