Merge pull request #188 from mplachter/add-pdf-document-loader

documentloaders: add pdf documentloader
tmc · Jul 13, 2023 · dcf7ecd · dcf7ecd
2 parents b593b11 + c27d44f
commit dcf7ecd
Show file tree

Hide file tree

Showing 7 changed files with 256 additions and 1 deletion.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -20,7 +20,7 @@ jobs:
       - name: golangci-lint
         uses: golangci/[email protected]
         with:
-          args: --timeout=2m
+          args: --timeout=4m
   build-examples:
     runs-on: ubuntu-latest
     steps:

diff --git a/documentloaders/pdf.go b/documentloaders/pdf.go
@@ -0,0 +1,113 @@
+package documentloaders
+
+import (
+	"context"
+	"io"
+
+	"github.com/ledongthuc/pdf"
+	"github.com/tmc/langchaingo/schema"
+	"github.com/tmc/langchaingo/textsplitter"
+)
+
+// PDF loads text data from a PDF document accessed through an io.ReaderAt.
+type PDF struct {
+	r        io.ReaderAt // random-access source of the PDF bytes
+	s        int64       // size passed to the pdf reader together with r
+	password string      // optional password; when empty the PDF is opened unencrypted
+}
+
+var _ Loader = PDF{} // compile-time check that PDF satisfies Loader
+
+// PDFOptions is a functional option that mutates a PDF loader; NewPDF applies each one it is given.
+type PDFOptions func(pdf *PDF)
+
+// WithPassword returns an option that stores password on the PDF loader.
+// Load uses the encrypted reader whenever a non-empty password is stored.
+func WithPassword(password string) PDFOptions {
+	setPassword := func(loader *PDF) {
+		loader.password = password
+	}
+	return setPassword
+}
+
+// NewPDF creates a new PDF loader that reads the document from r.
+// size is handed to the underlying pdf reader as the length of the data
+// behind r. Each option in opts is applied to the loader, in order, before
+// the loader is returned.
+func NewPDF(r io.ReaderAt, size int64, opts ...PDFOptions) PDF {
+	// Named "loader" rather than "pdf" so the imported pdf package is not shadowed.
+	loader := PDF{
+		r: r,
+		s: size,
+	}
+	for _, opt := range opts {
+		opt(&loader)
+	}
+	return loader
+}
+
+// getPassword returns the password for the PDF
+// it than clears the password on the struct so it can't be used again
+// if the password is cleared and tried to be used again it will fail.
+func (p *PDF) getPassword() string {
+	pass := p.password
+	p.password = ""
+	return pass
+}
+
+// Load opens the PDF behind the loader's io.ReaderAt and returns one document
+// per page, in page order. Each document holds the page's plain text plus two
+// metadata entries: "page" (the 1-based page number) and "total_pages" (the
+// page count reported by the reader). When a password was configured, the PDF
+// is opened through the encrypted reader. Any error from opening the PDF or
+// extracting text is returned as-is with a nil slice.
+func (p PDF) Load(_ context.Context) ([]schema.Document, error) {
+	var (
+		reader *pdf.Reader
+		err    error
+	)
+
+	if p.password == "" {
+		reader, err = pdf.NewReader(p.r, p.s)
+	} else {
+		// getPassword hands the password out once and then clears it on this
+		// local copy of the loader (Load has a value receiver).
+		reader, err = pdf.NewReaderEncrypted(p.r, p.s, p.getPassword)
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	total := reader.NumPage()
+	docs := make([]schema.Document, 0)
+
+	// Fonts collected so far, keyed by name; the same map is reused for every
+	// page when extracting plain text.
+	seenFonts := map[string]*pdf.Font{}
+	for pageNum := 1; pageNum < total+1; pageNum++ {
+		page := reader.Page(pageNum)
+
+		// Record any font used by this page that has not been seen yet.
+		for _, fontName := range page.Fonts() {
+			if _, known := seenFonts[fontName]; known {
+				continue
+			}
+			font := page.Font(fontName)
+			seenFonts[fontName] = &font
+		}
+
+		content, textErr := page.GetPlainText(seenFonts)
+		if textErr != nil {
+			return nil, textErr
+		}
+
+		docs = append(docs, schema.Document{
+			PageContent: content,
+			Metadata: map[string]any{
+				"page":        pageNum,
+				"total_pages": total,
+			},
+		})
+	}
+
+	return docs, nil
+}
+
+// LoadAndSplit loads the PDF with Load and then runs the resulting page
+// documents through splitter, returning the split documents. An error from
+// Load is returned unchanged with a nil slice.
+func (p PDF) LoadAndSplit(ctx context.Context, splitter textsplitter.TextSplitter) ([]schema.Document, error) {
+	pages, loadErr := p.Load(ctx)
+	if loadErr != nil {
+		return nil, loadErr
+	}
+
+	return textsplitter.SplitDocuments(splitter, pages)
+}
diff --git a/documentloaders/pdf_test.go b/documentloaders/pdf_test.go
@@ -0,0 +1,139 @@
+package documentloaders
+
+import (
+	"context"
+	"os"
+	"testing"
+
+	"github.com/ledongthuc/pdf"
+	"github.com/stretchr/testify/assert"
+	"github.com/tmc/langchaingo/textsplitter"
+)
+
+// TestPDFLoader checks that the PDF loader returns one document per page with
+// the expected text and page metadata, for both a plain and a
+// password-protected sample, and that a wrong password is rejected with
+// pdf.ErrInvalidPassword.
+func TestPDFLoader(t *testing.T) {
+	t.Parallel()
+
+	page1Content := " A Simple PDF File  This is a small demonstration .pdf file -  " +
+		"just for use in the Virtual Mechanics tutorials. More text. And more  text. And more " +
+		"text. And more text. And more text.  And more text. And more text. And more text. " +
+		"And more text. And more  text. And more text. Boring, zzzzz. And more text. And more " +
+		"text. And  more text. And more text. And more text. And more text. And more text.  " +
+		"And more text. And more text.  And more text. And more text. And more text. And more " +
+		"text. And more  text. And more text. And more text. Even more. Continued on page 2 ..."
+
+	page2Content := " Simple PDF File 2  ...continued from page 1. Yet more text. And more " +
+		"text. And more text.  And more text. And more text. And more text. And more text. And more " +
+		" text. Oh, how boring typing this stuff. But not as boring as watching  paint dry. And more " +
+		"text. And more text. And more text. And more text.  Boring.  More, a little more text. " +
+		"The end, and just as well. "
+
+	expectedResults := []struct {
+		content  string
+		metadata map[string]any
+	}{
+		{content: page1Content, metadata: map[string]any{"page": 1, "total_pages": 2}},
+		{content: page2Content, metadata: map[string]any{"page": 2, "total_pages": 2}},
+	}
+
+	t.Run("PDFLoad", func(t *testing.T) {
+		t.Parallel()
+		f, err := os.Open("./testdata/sample.pdf")
+		if !assert.NoError(t, err) {
+			return
+		}
+		defer f.Close()
+		finfo, err := f.Stat()
+		// finfo is nil on error; bail out instead of panicking on finfo.Size().
+		if !assert.NoError(t, err) {
+			return
+		}
+		p := NewPDF(f, finfo.Size())
+		docs, err := p.Load(context.Background())
+		assert.NoError(t, err)
+
+		// Stop on a length mismatch so the indexing below cannot go out of range.
+		if !assert.Len(t, docs, len(expectedResults)) {
+			return
+		}
+
+		for r := range expectedResults {
+			assert.Equal(t, expectedResults[r].content, docs[r].PageContent)
+			assert.Equal(t, expectedResults[r].metadata, docs[r].Metadata)
+		}
+	})
+
+	t.Run("PDFLoadPassword", func(t *testing.T) {
+		t.Parallel()
+		f, err := os.Open("./testdata/sample_password.pdf")
+		if !assert.NoError(t, err) {
+			return
+		}
+		defer f.Close()
+		finfo, err := f.Stat()
+		// finfo is nil on error; bail out instead of panicking on finfo.Size().
+		if !assert.NoError(t, err) {
+			return
+		}
+		p := NewPDF(f, finfo.Size(), WithPassword("password"))
+		docs, err := p.Load(context.Background())
+		assert.NoError(t, err)
+
+		// Stop on a length mismatch so the indexing below cannot go out of range.
+		if !assert.Len(t, docs, len(expectedResults)) {
+			return
+		}
+
+		for r := range expectedResults {
+			assert.Equal(t, expectedResults[r].content, docs[r].PageContent)
+			assert.Equal(t, expectedResults[r].metadata, docs[r].Metadata)
+		}
+	})
+
+	t.Run("PDFLoadPasswordWrong", func(t *testing.T) {
+		t.Parallel()
+		f, err := os.Open("./testdata/sample_password.pdf")
+		if !assert.NoError(t, err) {
+			return
+		}
+		defer f.Close()
+		finfo, err := f.Stat()
+		// finfo is nil on error; bail out instead of panicking on finfo.Size().
+		if !assert.NoError(t, err) {
+			return
+		}
+		p := NewPDF(f, finfo.Size(), WithPassword("password1"))
+		docs, err := p.Load(context.Background())
+		// ErrorIs verifies the sentinel itself; Errorf would only check that
+		// err is non-nil and use the string as a failure-message format.
+		assert.ErrorIs(t, err, pdf.ErrInvalidPassword)
+
+		assert.Len(t, docs, 0)
+	})
+}
+
+// TestPDFTextSplit checks that LoadAndSplit, using a recursive character
+// splitter with a chunk size of 300 and an overlap of 30, splits each of the
+// two sample pages into two chunks that keep their page metadata.
+func TestPDFTextSplit(t *testing.T) {
+	t.Parallel()
+	page1Part1Content := "A Simple PDF File  This is a small demonstration .pdf file -  " +
+		"just for use in the Virtual Mechanics tutorials. More text. And more  text. And more " +
+		"text. And more text. And more text.  And more text. And more text. And more text. And " +
+		"more text. And more  text. And more text. Boring, zzzzz. And more"
+	page1Part2Content := "text. Boring, zzzzz. And more text. And more text. And  more text. And " +
+		"more text. And more text. And more text. And more text.  And more text. And more text.  And " +
+		"more text. And more text. And more text. And more text. And more  text. And more text. And " +
+		"more text. Even more. Continued on page 2 ..."
+
+	page2Part1Content := "Simple PDF File 2  ...continued from page 1. Yet more text. And more text. " +
+		"And more text.  And more text. And more text. And more text. And more text. And more  text. " +
+		"Oh, how boring typing this stuff. But not as boring as watching  paint dry. And more text. " +
+		"And more text. And more text. And more"
+	page2Part2Content := "text. And more text. And more text.  Boring.  More, a little more text. The end, and just as well."
+
+	expectedResults := []struct {
+		content  string
+		metadata map[string]any
+	}{
+		{content: page1Part1Content, metadata: map[string]any{"page": 1, "total_pages": 2}},
+		{content: page1Part2Content, metadata: map[string]any{"page": 1, "total_pages": 2}},
+		{content: page2Part1Content, metadata: map[string]any{"page": 2, "total_pages": 2}},
+		{content: page2Part2Content, metadata: map[string]any{"page": 2, "total_pages": 2}},
+	}
+
+	t.Run("PDFTextSplit", func(t *testing.T) {
+		t.Parallel()
+		f, err := os.Open("./testdata/sample.pdf")
+		if !assert.NoError(t, err) {
+			return
+		}
+		defer f.Close()
+		finfo, err := f.Stat()
+		// finfo is nil on error; bail out instead of panicking on finfo.Size().
+		if !assert.NoError(t, err) {
+			return
+		}
+		p := NewPDF(f, finfo.Size())
+		split := textsplitter.NewRecursiveCharacter()
+		split.ChunkSize = 300
+		split.ChunkOverlap = 30
+		docs, err := p.LoadAndSplit(context.Background(), split)
+		assert.NoError(t, err)
+
+		// Stop on a length mismatch so the indexing below cannot go out of range.
+		if !assert.Len(t, docs, len(expectedResults)) {
+			return
+		}
+
+		for r := range expectedResults {
+			assert.Equal(t, expectedResults[r].content, docs[r].PageContent)
+			assert.Equal(t, expectedResults[r].metadata, docs[r].Metadata)
+		}
+	})
+}
diff --git a/documentloaders/testdata/sample.pdf b/documentloaders/testdata/sample.pdf
diff --git a/documentloaders/testdata/sample_password.pdf b/documentloaders/testdata/sample_password.pdf
diff --git a/go.mod b/go.mod
@@ -71,6 +71,7 @@ require (
 	github.com/go-sql-driver/mysql v1.7.1
 	github.com/google/go-cmp v0.5.9
 	github.com/jackc/pgx/v5 v5.4.1
+	github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80
 	github.com/mattn/go-sqlite3 v1.14.17
 	github.com/microcosm-cc/bluemonday v1.0.24
 	github.com/pinecone-io/go-pinecone v0.3.0

diff --git a/go.sum b/go.sum
@@ -257,6 +257,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo=
+github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
 github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
 github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
 github.com/mailru/easyjson v0.7.6/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=