Skip to content

Commit

Permalink
Merge pull request #188 from mplachter/add-pdf-document-loader
Browse files Browse the repository at this point in the history
documentloaders: add pdf documentloader
  • Loading branch information
tmc authored Jul 13, 2023
2 parents b593b11 + c27d44f commit dcf7ecd
Show file tree
Hide file tree
Showing 7 changed files with 256 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
- name: golangci-lint
uses: golangci/[email protected]
with:
args: --timeout=2m
args: --timeout=4m
build-examples:
runs-on: ubuntu-latest
steps:
Expand Down
113 changes: 113 additions & 0 deletions documentloaders/pdf.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
package documentloaders

import (
"context"
"io"

"github.com/ledongthuc/pdf"
"github.com/tmc/langchaingo/schema"
"github.com/tmc/langchaingo/textsplitter"
)

// PDF is a document loader that extracts text from PDF data.
//
// The data is accessed through an io.ReaderAt (not a plain io.Reader), and the
// size of that data is stored alongside it because both are handed to the
// underlying PDF reader when Load is called.
type PDF struct {
// r is the source the PDF data is read from.
r io.ReaderAt
// s is the size value passed to the PDF reader together with r.
s int64
// password is the optional document password; Load treats the empty string
// as "no password set".
password string
}

// Compile-time assertion that the PDF value type satisfies the Loader interface.
var _ Loader = PDF{}

// PDFOptions is a functional option for the PDF loader: a function that
// receives a pointer to the loader and mutates its configuration. NewPDF
// applies every option it is given to the loader it constructs.
type PDFOptions func(pdf *PDF)

// WithPassword returns an option that stores password on the PDF loader so
// that Load opens the document as an encrypted PDF.
func WithPassword(password string) PDFOptions {
	setPassword := func(loader *PDF) {
		loader.password = password
	}
	return setPassword
}

// NewPDF creates a PDF loader that reads PDF data from r.
//
// size is stored next to r and passed to the underlying PDF reader when Load
// is called. Each non-nil option in opts is applied, in order, to the loader
// before it is returned; nil options are skipped instead of panicking. Use
// WithPassword to load an encrypted document.
func NewPDF(r io.ReaderAt, size int64, opts ...PDFOptions) PDF {
	// The local is named loader (not pdf) so it does not shadow the imported
	// pdf package.
	loader := PDF{
		r: r,
		s: size,
	}
	for _, opt := range opts {
		if opt == nil {
			continue
		}
		opt(&loader)
	}
	return loader
}

// getPassword hands out the stored password exactly once: it returns the
// current value and then blanks the field on the receiver, so every later call
// on the same receiver returns the empty string. Load passes it as the
// password callback to pdf.NewReaderEncrypted.
func (p *PDF) getPassword() string {
	var current string
	current, p.password = p.password, ""
	return current
}

// Load opens the PDF data held by the loader and returns one schema.Document
// per page, in page order. Each document carries the page's plain text as
// PageContent and two metadata entries: "page" (the 1-based page number) and
// "total_pages" (the page count reported by the reader).
//
// When a password has been set the data is opened with
// pdf.NewReaderEncrypted, otherwise with pdf.NewReader. Any error from opening
// the reader or from extracting a page's text is returned as-is with a nil
// slice. The context argument is not used.
func (p PDF) Load(_ context.Context) ([]schema.Document, error) {
	var (
		reader  *pdf.Reader
		openErr error
	)
	switch p.password {
	case "":
		reader, openErr = pdf.NewReader(p.r, p.s)
	default:
		// Load has a value receiver, so getPassword is bound to this call's
		// own copy of the loader: clearing the password there does not touch
		// the caller's PDF value.
		reader, openErr = pdf.NewReaderEncrypted(p.r, p.s, p.getPassword)
	}
	if openErr != nil {
		return nil, openErr
	}

	total := reader.NumPage()
	docs := make([]schema.Document, 0)

	// Fonts seen so far, keyed by name; the same map is handed to every
	// page's plain-text extraction.
	fontCache := map[string]*pdf.Font{}
	for pageNum := 1; pageNum < total+1; pageNum++ {
		page := reader.Page(pageNum)

		// Register only the fonts that are not cached yet.
		for _, fontName := range page.Fonts() {
			if _, seen := fontCache[fontName]; seen {
				continue
			}
			font := page.Font(fontName)
			fontCache[fontName] = &font
		}

		content, err := page.GetPlainText(fontCache)
		if err != nil {
			return nil, err
		}

		meta := map[string]any{
			"page":        pageNum,
			"total_pages": total,
		}
		docs = append(docs, schema.Document{PageContent: content, Metadata: meta})
	}

	return docs, nil
}

// LoadAndSplit loads the PDF with Load and then passes the resulting per-page
// documents through textsplitter.SplitDocuments using the given splitter.
// A Load error is returned unchanged with a nil slice.
func (p PDF) LoadAndSplit(ctx context.Context, splitter textsplitter.TextSplitter) ([]schema.Document, error) {
	pages, loadErr := p.Load(ctx)
	if loadErr != nil {
		return nil, loadErr
	}
	return textsplitter.SplitDocuments(splitter, pages)
}
139 changes: 139 additions & 0 deletions documentloaders/pdf_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
package documentloaders

import (
"context"
"os"
"testing"

"github.com/ledongthuc/pdf"
"github.com/stretchr/testify/assert"
"github.com/tmc/langchaingo/textsplitter"
)

// TestPDFLoader checks that the PDF loader returns one document per page with
// the expected text and page metadata, for a plain PDF and for a
// password-protected PDF, and that a wrong password yields
// pdf.ErrInvalidPassword.
func TestPDFLoader(t *testing.T) {
	t.Parallel()

	page1Content := " A Simple PDF File This is a small demonstration .pdf file - " +
		"just for use in the Virtual Mechanics tutorials. More text. And more text. And more " +
		"text. And more text. And more text. And more text. And more text. And more text. " +
		"And more text. And more text. And more text. Boring, zzzzz. And more text. And more " +
		"text. And more text. And more text. And more text. And more text. And more text. " +
		"And more text. And more text. And more text. And more text. And more text. And more " +
		"text. And more text. And more text. And more text. Even more. Continued on page 2 ..."

	page2Content := " Simple PDF File 2 ...continued from page 1. Yet more text. And more " +
		"text. And more text. And more text. And more text. And more text. And more text. And more " +
		" text. Oh, how boring typing this stuff. But not as boring as watching paint dry. And more " +
		"text. And more text. And more text. And more text. Boring. More, a little more text. " +
		"The end, and just as well. "

	expectedResults := []struct {
		content  string
		metadata map[string]any
	}{
		{content: page1Content, metadata: map[string]any{"page": 1, "total_pages": 2}},
		{content: page2Content, metadata: map[string]any{"page": 2, "total_pages": 2}},
	}

	t.Run("PDFLoad", func(t *testing.T) {
		t.Parallel()
		f, err := os.Open("./testdata/sample.pdf")
		// Stop on failure: continuing would dereference a nil FileInfo below.
		if !assert.NoError(t, err) {
			return
		}
		defer f.Close()
		finfo, err := f.Stat()
		if !assert.NoError(t, err) {
			return
		}
		p := NewPDF(f, finfo.Size())
		docs, err := p.Load(context.Background())
		assert.NoError(t, err)

		// Stop on a length mismatch so the indexing below cannot go out of range.
		if !assert.Len(t, docs, 2) {
			return
		}

		for i, want := range expectedResults {
			assert.Equal(t, want.content, docs[i].PageContent)
			assert.Equal(t, want.metadata, docs[i].Metadata)
		}
	})

	t.Run("PDFLoadPassword", func(t *testing.T) {
		t.Parallel()
		f, err := os.Open("./testdata/sample_password.pdf")
		if !assert.NoError(t, err) {
			return
		}
		defer f.Close()
		finfo, err := f.Stat()
		if !assert.NoError(t, err) {
			return
		}
		p := NewPDF(f, finfo.Size(), WithPassword("password"))
		docs, err := p.Load(context.Background())
		assert.NoError(t, err)

		if !assert.Len(t, docs, 2) {
			return
		}

		for i, want := range expectedResults {
			assert.Equal(t, want.content, docs[i].PageContent)
			assert.Equal(t, want.metadata, docs[i].Metadata)
		}
	})

	t.Run("PDFLoadPasswordWrong", func(t *testing.T) {
		t.Parallel()
		f, err := os.Open("./testdata/sample_password.pdf")
		if !assert.NoError(t, err) {
			return
		}
		defer f.Close()
		finfo, err := f.Stat()
		if !assert.NoError(t, err) {
			return
		}
		p := NewPDF(f, finfo.Size(), WithPassword("password1"))
		docs, err := p.Load(context.Background())
		// assert.Errorf only checked that err was non-nil (its string argument
		// is a failure-message format); ErrorIs pins the actual error.
		assert.ErrorIs(t, err, pdf.ErrInvalidPassword)

		assert.Len(t, docs, 0)
	})
}

// TestPDFTextSplit checks that LoadAndSplit, driven by a recursive character
// splitter with ChunkSize 300 and ChunkOverlap 30, splits the two-page sample
// PDF into four chunks that keep the metadata of the page they came from.
func TestPDFTextSplit(t *testing.T) {
	t.Parallel()
	page1Part1 := "A Simple PDF File This is a small demonstration .pdf file - " +
		"just for use in the Virtual Mechanics tutorials. More text. And more text. And more " +
		"text. And more text. And more text. And more text. And more text. And more text. And " +
		"more text. And more text. And more text. Boring, zzzzz. And more"
	page1Part2 := "text. Boring, zzzzz. And more text. And more text. And more text. And " +
		"more text. And more text. And more text. And more text. And more text. And more text. And " +
		"more text. And more text. And more text. And more text. And more text. And more text. And " +
		"more text. Even more. Continued on page 2 ..."

	page2Part1 := "Simple PDF File 2 ...continued from page 1. Yet more text. And more text. " +
		"And more text. And more text. And more text. And more text. And more text. And more text. " +
		"Oh, how boring typing this stuff. But not as boring as watching paint dry. And more text. " +
		"And more text. And more text. And more"
	page2Part2 := "text. And more text. And more text. Boring. More, a little more text. The end, and just as well."

	expectedResults := []struct {
		content  string
		metadata map[string]any
	}{
		{content: page1Part1, metadata: map[string]any{"page": 1, "total_pages": 2}},
		{content: page1Part2, metadata: map[string]any{"page": 1, "total_pages": 2}},
		{content: page2Part1, metadata: map[string]any{"page": 2, "total_pages": 2}},
		{content: page2Part2, metadata: map[string]any{"page": 2, "total_pages": 2}},
	}

	t.Run("PDFTextSplit", func(t *testing.T) {
		t.Parallel()
		f, err := os.Open("./testdata/sample.pdf")
		// Stop on failure: continuing would dereference a nil FileInfo below.
		if !assert.NoError(t, err) {
			return
		}
		defer f.Close()
		finfo, err := f.Stat()
		if !assert.NoError(t, err) {
			return
		}
		p := NewPDF(f, finfo.Size())
		split := textsplitter.NewRecursiveCharacter()
		split.ChunkSize = 300
		split.ChunkOverlap = 30
		docs, err := p.LoadAndSplit(context.Background(), split)
		assert.NoError(t, err)

		// Stop on a length mismatch so the indexing below cannot go out of range.
		if !assert.Len(t, docs, 4) {
			return
		}

		for i, want := range expectedResults {
			assert.Equal(t, want.content, docs[i].PageContent)
			assert.Equal(t, want.metadata, docs[i].Metadata)
		}
	})
}
Binary file added documentloaders/testdata/sample.pdf
Binary file not shown.
Binary file added documentloaders/testdata/sample_password.pdf
Binary file not shown.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ require (
github.com/go-sql-driver/mysql v1.7.1
github.com/google/go-cmp v0.5.9
github.com/jackc/pgx/v5 v5.4.1
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80
github.com/mattn/go-sqlite3 v1.14.17
github.com/microcosm-cc/bluemonday v1.0.24
github.com/pinecone-io/go-pinecone v0.3.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.7.6/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
Expand Down

0 comments on commit dcf7ecd

Please sign in to comment.