-
-
Notifications
You must be signed in to change notification settings - Fork 649
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #188 from mplachter/add-pdf-document-loader
documentloaders: add pdf documentloader
- Loading branch information
Showing
7 changed files
with
256 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,7 @@ jobs: | |
- name: golangci-lint | ||
uses: golangci/[email protected] | ||
with: | ||
args: --timeout=2m | ||
args: --timeout=4m | ||
build-examples: | ||
runs-on: ubuntu-latest | ||
steps: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
package documentloaders | ||
|
||
import ( | ||
"context" | ||
"io" | ||
|
||
"github.com/ledongthuc/pdf" | ||
"github.com/tmc/langchaingo/schema" | ||
"github.com/tmc/langchaingo/textsplitter" | ||
) | ||
|
||
// PDF is a document loader that extracts text from PDF data read through an | ||
// io.ReaderAt. | ||
type PDF struct { | ||
// r is the source of the raw PDF data; it is handed to the pdf reader in Load. | ||
r io.ReaderAt | ||
// s is the size of the PDF data, as supplied to NewPDF. | ||
s int64 | ||
// password is the optional password for the PDF; Load only opens the | ||
// PDF through the encrypted path when it is non-empty. | ||
password string | ||
} | ||
|
||
// Compile-time assertion that a PDF value satisfies the Loader interface. | ||
var _ Loader = PDF{} | ||
|
||
// PDFOptions are options for the PDF loader. | ||
type PDFOptions func(pdf *PDF) | ||
|
||
// WithPassword sets the password for the PDF. | ||
func WithPassword(password string) PDFOptions { | ||
return func(pdf *PDF) { | ||
pdf.password = password | ||
} | ||
} | ||
|
||
// NewText creates a new text loader with an io.Reader. | ||
func NewPDF(r io.ReaderAt, size int64, opts ...PDFOptions) PDF { | ||
pdf := PDF{ | ||
r: r, | ||
s: size, | ||
} | ||
for _, opt := range opts { | ||
opt(&pdf) | ||
} | ||
return pdf | ||
} | ||
|
||
// getPassword returns the password for the PDF | ||
// it than clears the password on the struct so it can't be used again | ||
// if the password is cleared and tried to be used again it will fail. | ||
func (p *PDF) getPassword() string { | ||
pass := p.password | ||
p.password = "" | ||
return pass | ||
} | ||
|
||
// Load reads from the io.Reader for the PDF data and returns the documents with the data and with | ||
// metadata attached of the page number and total number of pages of the PDF. | ||
func (p PDF) Load(_ context.Context) ([]schema.Document, error) { | ||
var reader *pdf.Reader | ||
var err error | ||
|
||
if p.password != "" { | ||
reader, err = pdf.NewReaderEncrypted(p.r, p.s, p.getPassword) | ||
if err != nil { | ||
return nil, err | ||
} | ||
} else { | ||
reader, err = pdf.NewReader(p.r, p.s) | ||
if err != nil { | ||
return nil, err | ||
} | ||
} | ||
|
||
numPages := reader.NumPage() | ||
|
||
docs := []schema.Document{} | ||
|
||
// fonts to be used when getting plain text from pages | ||
fonts := make(map[string]*pdf.Font) | ||
for i := 1; i < numPages+1; i++ { | ||
p := reader.Page(i) | ||
// add fonts to map | ||
for _, name := range p.Fonts() { | ||
// only add the font if we don't already have it | ||
if _, ok := fonts[name]; !ok { | ||
f := p.Font(name) | ||
fonts[name] = &f | ||
} | ||
} | ||
text, err := p.GetPlainText(fonts) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
// add the document to the doc list | ||
docs = append(docs, schema.Document{ | ||
PageContent: text, | ||
Metadata: map[string]any{ | ||
"page": i, | ||
"total_pages": numPages, | ||
}, | ||
}) | ||
} | ||
|
||
return docs, nil | ||
} | ||
|
||
// LoadAndSplit reads pdf data from the io.Reader and splits it into multiple | ||
// documents using a text splitter. | ||
func (p PDF) LoadAndSplit(ctx context.Context, splitter textsplitter.TextSplitter) ([]schema.Document, error) { | ||
docs, err := p.Load(ctx) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
return textsplitter.SplitDocuments(splitter, docs) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
package documentloaders | ||
|
||
import ( | ||
"context" | ||
"os" | ||
"testing" | ||
|
||
"github.com/ledongthuc/pdf" | ||
"github.com/stretchr/testify/assert" | ||
"github.com/tmc/langchaingo/textsplitter" | ||
) | ||
|
||
func TestPDFLoader(t *testing.T) { | ||
t.Parallel() | ||
|
||
page1Content := " A Simple PDF File This is a small demonstration .pdf file - " + | ||
"just for use in the Virtual Mechanics tutorials. More text. And more text. And more " + | ||
"text. And more text. And more text. And more text. And more text. And more text. " + | ||
"And more text. And more text. And more text. Boring, zzzzz. And more text. And more " + | ||
"text. And more text. And more text. And more text. And more text. And more text. " + | ||
"And more text. And more text. And more text. And more text. And more text. And more " + | ||
"text. And more text. And more text. And more text. Even more. Continued on page 2 ..." | ||
|
||
page2Content := " Simple PDF File 2 ...continued from page 1. Yet more text. And more " + | ||
"text. And more text. And more text. And more text. And more text. And more text. And more " + | ||
" text. Oh, how boring typing this stuff. But not as boring as watching paint dry. And more " + | ||
"text. And more text. And more text. And more text. Boring. More, a little more text. " + | ||
"The end, and just as well. " | ||
|
||
expectedResults := []struct { | ||
content string | ||
metadata map[string]any | ||
}{ | ||
{content: page1Content, metadata: map[string]any{"page": 1, "total_pages": 2}}, | ||
{content: page2Content, metadata: map[string]any{"page": 2, "total_pages": 2}}, | ||
} | ||
|
||
t.Run("PDFLoad", func(t *testing.T) { | ||
t.Parallel() | ||
f, err := os.Open("./testdata/sample.pdf") | ||
assert.NoError(t, err) | ||
defer f.Close() | ||
finfo, err := f.Stat() | ||
assert.NoError(t, err) | ||
p := NewPDF(f, finfo.Size()) | ||
docs, err := p.Load(context.Background()) | ||
assert.NoError(t, err) | ||
|
||
assert.Len(t, docs, 2) | ||
|
||
for r := range expectedResults { | ||
assert.Equal(t, expectedResults[r].content, docs[r].PageContent) | ||
assert.Equal(t, expectedResults[r].metadata, docs[r].Metadata) | ||
} | ||
}) | ||
|
||
t.Run("PDFLoadPassword", func(t *testing.T) { | ||
t.Parallel() | ||
f, err := os.Open("./testdata/sample_password.pdf") | ||
assert.NoError(t, err) | ||
defer f.Close() | ||
finfo, err := f.Stat() | ||
assert.NoError(t, err) | ||
p := NewPDF(f, finfo.Size(), WithPassword("password")) | ||
docs, err := p.Load(context.Background()) | ||
assert.NoError(t, err) | ||
|
||
assert.Len(t, docs, 2) | ||
|
||
for r := range expectedResults { | ||
assert.Equal(t, expectedResults[r].content, docs[r].PageContent) | ||
assert.Equal(t, expectedResults[r].metadata, docs[r].Metadata) | ||
} | ||
}) | ||
|
||
t.Run("PDFLoadPasswordWrong", func(t *testing.T) { | ||
t.Parallel() | ||
f, err := os.Open("./testdata/sample_password.pdf") | ||
assert.NoError(t, err) | ||
defer f.Close() | ||
finfo, err := f.Stat() | ||
assert.NoError(t, err) | ||
p := NewPDF(f, finfo.Size(), WithPassword("password1")) | ||
docs, err := p.Load(context.Background()) | ||
assert.Errorf(t, err, pdf.ErrInvalidPassword.Error()) | ||
|
||
assert.Len(t, docs, 0) | ||
}) | ||
} | ||
|
||
func TestPDFTextSplit(t *testing.T) { | ||
t.Parallel() | ||
page1_1Content := "A Simple PDF File This is a small demonstration .pdf file - " + | ||
"just for use in the Virtual Mechanics tutorials. More text. And more text. And more " + | ||
"text. And more text. And more text. And more text. And more text. And more text. And " + | ||
"more text. And more text. And more text. Boring, zzzzz. And more" | ||
page1_2Content := "text. Boring, zzzzz. And more text. And more text. And more text. And " + | ||
"more text. And more text. And more text. And more text. And more text. And more text. And " + | ||
"more text. And more text. And more text. And more text. And more text. And more text. And " + | ||
"more text. Even more. Continued on page 2 ..." | ||
|
||
page2_1Content := "Simple PDF File 2 ...continued from page 1. Yet more text. And more text. " + | ||
"And more text. And more text. And more text. And more text. And more text. And more text. " + | ||
"Oh, how boring typing this stuff. But not as boring as watching paint dry. And more text. " + | ||
"And more text. And more text. And more" | ||
page2_2Content := "text. And more text. And more text. Boring. More, a little more text. The end, and just as well." | ||
|
||
expectedResults := []struct { | ||
content string | ||
metadata map[string]any | ||
}{ | ||
{content: page1_1Content, metadata: map[string]any{"page": 1, "total_pages": 2}}, | ||
{content: page1_2Content, metadata: map[string]any{"page": 1, "total_pages": 2}}, | ||
{content: page2_1Content, metadata: map[string]any{"page": 2, "total_pages": 2}}, | ||
{content: page2_2Content, metadata: map[string]any{"page": 2, "total_pages": 2}}, | ||
} | ||
|
||
t.Run("PDFTextSplit", func(t *testing.T) { | ||
t.Parallel() | ||
f, err := os.Open("./testdata/sample.pdf") | ||
assert.NoError(t, err) | ||
defer f.Close() | ||
finfo, err := f.Stat() | ||
assert.NoError(t, err) | ||
p := NewPDF(f, finfo.Size()) | ||
split := textsplitter.NewRecursiveCharacter() | ||
split.ChunkSize = 300 | ||
split.ChunkOverlap = 30 | ||
docs, err := p.LoadAndSplit(context.Background(), split) | ||
assert.NoError(t, err) | ||
|
||
assert.Len(t, docs, 4) | ||
|
||
for r := range expectedResults { | ||
assert.Equal(t, expectedResults[r].content, docs[r].PageContent) | ||
assert.Equal(t, expectedResults[r].metadata, docs[r].Metadata) | ||
} | ||
}) | ||
} |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters