From 9547d68c00471d3e0d387dc03d934e36266446cf Mon Sep 17 00:00:00 2001 From: Matthew Plachter Date: Tue, 11 Jul 2023 20:42:40 -0400 Subject: [PATCH 1/2] add pdf documentloader Update pdf_test.go fix assert order on test --- documentloaders/pdf.go | 113 +++++++++++++++ documentloaders/pdf_test.go | 139 +++++++++++++++++++ documentloaders/testdata/sample.pdf | Bin 0 -> 3028 bytes documentloaders/testdata/sample_password.pdf | Bin 0 -> 16479 bytes go.mod | 1 + go.sum | 2 + 6 files changed, 255 insertions(+) create mode 100644 documentloaders/pdf.go create mode 100644 documentloaders/pdf_test.go create mode 100644 documentloaders/testdata/sample.pdf create mode 100644 documentloaders/testdata/sample_password.pdf diff --git a/documentloaders/pdf.go b/documentloaders/pdf.go new file mode 100644 index 000000000..d9abd7eca --- /dev/null +++ b/documentloaders/pdf.go @@ -0,0 +1,113 @@ +package documentloaders + +import ( + "context" + "io" + + "github.com/ledongthuc/pdf" + "github.com/tmc/langchaingo/schema" + "github.com/tmc/langchaingo/textsplitter" +) + +// PDF loads text data from an io.ReaderAt. +type PDF struct { + r io.ReaderAt + s int64 + password string +} + +var _ Loader = PDF{} + +// PDFOptions are options for the PDF loader. +type PDFOptions func(pdf *PDF) + +// WithPassword sets the password for the PDF. +func WithPassword(password string) PDFOptions { + return func(pdf *PDF) { + pdf.password = password + } +} + +// NewPDF creates a new PDF loader with an io.ReaderAt and its size. +func NewPDF(r io.ReaderAt, size int64, opts ...PDFOptions) PDF { + pdf := PDF{ + r: r, + s: size, + } + for _, opt := range opts { + opt(&pdf) + } + return pdf +} + +// getPassword returns the password for the PDF. +// It then clears the password on the struct so it can't be used again; +// if the password is cleared and tried to be used again it will fail. 
+func (p *PDF) getPassword() string { + pass := p.password + p.password = "" + return pass +} + +// Load reads the PDF data from the io.ReaderAt and returns one document per page, +// with metadata attached of the page number and total number of pages of the PDF. +func (p PDF) Load(_ context.Context) ([]schema.Document, error) { + var reader *pdf.Reader + var err error + + if p.password != "" { + reader, err = pdf.NewReaderEncrypted(p.r, p.s, p.getPassword) + if err != nil { + return nil, err + } + } else { + reader, err = pdf.NewReader(p.r, p.s) + if err != nil { + return nil, err + } + } + + numPages := reader.NumPage() + + docs := []schema.Document{} + + // fonts to be used when getting plain text from pages + fonts := make(map[string]*pdf.Font) + for i := 1; i < numPages+1; i++ { + p := reader.Page(i) + // add fonts to map + for _, name := range p.Fonts() { + // only add the font if we don't already have it + if _, ok := fonts[name]; !ok { + f := p.Font(name) + fonts[name] = &f + } + } + text, err := p.GetPlainText(fonts) + if err != nil { + return nil, err + } + + // add the document to the doc list + docs = append(docs, schema.Document{ + PageContent: text, + Metadata: map[string]any{ + "page": i, + "total_pages": numPages, + }, + }) + } + + return docs, nil +} + +// LoadAndSplit reads PDF data from the io.ReaderAt and splits it into multiple +// documents using a text splitter. 
+func (p PDF) LoadAndSplit(ctx context.Context, splitter textsplitter.TextSplitter) ([]schema.Document, error) { + docs, err := p.Load(ctx) + if err != nil { + return nil, err + } + + return textsplitter.SplitDocuments(splitter, docs) +} diff --git a/documentloaders/pdf_test.go b/documentloaders/pdf_test.go new file mode 100644 index 000000000..9a56b27dd --- /dev/null +++ b/documentloaders/pdf_test.go @@ -0,0 +1,139 @@ +package documentloaders + +import ( + "context" + "os" + "testing" + + "github.com/ledongthuc/pdf" + "github.com/stretchr/testify/assert" + "github.com/tmc/langchaingo/textsplitter" +) + +func TestPDFLoader(t *testing.T) { + t.Parallel() + + page1Content := " A Simple PDF File This is a small demonstration .pdf file - " + + "just for use in the Virtual Mechanics tutorials. More text. And more text. And more " + + "text. And more text. And more text. And more text. And more text. And more text. " + + "And more text. And more text. And more text. Boring, zzzzz. And more text. And more " + + "text. And more text. And more text. And more text. And more text. And more text. " + + "And more text. And more text. And more text. And more text. And more text. And more " + + "text. And more text. And more text. And more text. Even more. Continued on page 2 ..." + + page2Content := " Simple PDF File 2 ...continued from page 1. Yet more text. And more " + + "text. And more text. And more text. And more text. And more text. And more text. And more " + + " text. Oh, how boring typing this stuff. But not as boring as watching paint dry. And more " + + "text. And more text. And more text. And more text. Boring. More, a little more text. " + + "The end, and just as well. 
" + + expectedResults := []struct { + content string + metadata map[string]any + }{ + {content: page1Content, metadata: map[string]any{"page": 1, "total_pages": 2}}, + {content: page2Content, metadata: map[string]any{"page": 2, "total_pages": 2}}, + } + + t.Run("PDFLoad", func(t *testing.T) { + t.Parallel() + f, err := os.Open("./testdata/sample.pdf") + assert.NoError(t, err) + defer f.Close() + finfo, err := f.Stat() + assert.NoError(t, err) + p := NewPDF(f, finfo.Size()) + docs, err := p.Load(context.Background()) + assert.NoError(t, err) + + assert.Len(t, docs, 2) + + for r := range expectedResults { + assert.Equal(t, expectedResults[r].content, docs[r].PageContent) + assert.Equal(t, expectedResults[r].metadata, docs[r].Metadata) + } + }) + + t.Run("PDFLoadPassword", func(t *testing.T) { + t.Parallel() + f, err := os.Open("./testdata/sample_password.pdf") + assert.NoError(t, err) + defer f.Close() + finfo, err := f.Stat() + assert.NoError(t, err) + p := NewPDF(f, finfo.Size(), WithPassword("password")) + docs, err := p.Load(context.Background()) + assert.NoError(t, err) + + assert.Len(t, docs, 2) + + for r := range expectedResults { + assert.Equal(t, expectedResults[r].content, docs[r].PageContent) + assert.Equal(t, expectedResults[r].metadata, docs[r].Metadata) + } + }) + + t.Run("PDFLoadPasswordWrong", func(t *testing.T) { + t.Parallel() + f, err := os.Open("./testdata/sample_password.pdf") + assert.NoError(t, err) + defer f.Close() + finfo, err := f.Stat() + assert.NoError(t, err) + p := NewPDF(f, finfo.Size(), WithPassword("password1")) + docs, err := p.Load(context.Background()) + assert.Errorf(t, err, pdf.ErrInvalidPassword.Error()) + + assert.Len(t, docs, 0) + }) +} + +func TestPDFTextSplit(t *testing.T) { + t.Parallel() + page1_1Content := "A Simple PDF File This is a small demonstration .pdf file - " + + "just for use in the Virtual Mechanics tutorials. More text. And more text. And more " + + "text. And more text. And more text. And more text. 
And more text. And more text. And " + + "more text. And more text. And more text. Boring, zzzzz. And more" + page1_2Content := "text. Boring, zzzzz. And more text. And more text. And more text. And " + + "more text. And more text. And more text. And more text. And more text. And more text. And " + + "more text. And more text. And more text. And more text. And more text. And more text. And " + + "more text. Even more. Continued on page 2 ..." + + page2_1Content := "Simple PDF File 2 ...continued from page 1. Yet more text. And more text. " + + "And more text. And more text. And more text. And more text. And more text. And more text. " + + "Oh, how boring typing this stuff. But not as boring as watching paint dry. And more text. " + + "And more text. And more text. And more" + page2_2Content := "text. And more text. And more text. Boring. More, a little more text. The end, and just as well." + + expectedResults := []struct { + content string + metadata map[string]any + }{ + {content: page1_1Content, metadata: map[string]any{"page": 1, "total_pages": 2}}, + {content: page1_2Content, metadata: map[string]any{"page": 1, "total_pages": 2}}, + {content: page2_1Content, metadata: map[string]any{"page": 2, "total_pages": 2}}, + {content: page2_2Content, metadata: map[string]any{"page": 2, "total_pages": 2}}, + } + + t.Run("PDFTextSplit", func(t *testing.T) { + t.Parallel() + f, err := os.Open("./testdata/sample.pdf") + assert.NoError(t, err) + defer f.Close() + finfo, err := f.Stat() + assert.NoError(t, err) + p := NewPDF(f, finfo.Size()) + split := textsplitter.NewRecursiveCharacter() + split.ChunkSize = 300 + split.ChunkOverlap = 30 + docs, err := p.LoadAndSplit(context.Background(), split) + assert.NoError(t, err) + + assert.Len(t, docs, 4) + + for r := range expectedResults { + assert.Equal(t, expectedResults[r].content, docs[r].PageContent) + assert.Equal(t, expectedResults[r].metadata, docs[r].Metadata) + } + }) +} diff --git a/documentloaders/testdata/sample.pdf 
b/documentloaders/testdata/sample.pdf new file mode 100644 index 0000000000000000000000000000000000000000..dbf091df9a607221e000593a8b5a97b5ea5fb073 GIT binary patch literal 3028 zcmd5;+iK%D7``|79sZj_3mw^3d>n>>rft|$r=<Gl zWIx$CH11?D%do5oLHZ^AN9p^&qnnG-8;=ca>*%k)|M=6kY|A5;iigj(_3oW*IpgQ0 zB;?N?G}O?F~e-o&fdSbEveGxxNVs&T}_+wIC);wN|S3_`=^Ym z?y1Je_6W!5=Pa%0o_u4M!sh=IbybO>E+wq5{dR6;Rn+AKku*_{3aqswkCH}v ztJ}FLi^-kT6dU1Mb|uqH42vhacOeZu&Rl#HCGFr-xBWM-|+e^~95C3kuyCa8Nz&=d3 zPw9GoO7mhx4-J@*eqI7o0NLmbm9D2#M#EZ@Dl~~|vk9Y>(382@*~jiaPA^4vloGW^${BO z&|L0&$FyZ34q1)S0TXjiAeyzye;HJqPhX+oj`M@hIuz@m%ZWTgO?gR!qsqvQPqV zVBwTl{djT$Lm)?KJ&`!^p- zB?yU29^x`|s{JSof_pN9Oqel#YyZFw~B0kRS*9GB0?&&kJAg z<34|7m-_(#cV8b5!0fg%+X9aQc`Db0`!4$;o1mTB0`JJMabTnKqnZ}rgzd~^$`C_Q S>NZV0@_bPEqs!}&ZT$n`rwj1_ literal 0 HcmV?d00001 diff --git a/documentloaders/testdata/sample_password.pdf b/documentloaders/testdata/sample_password.pdf new file mode 100644 index 0000000000000000000000000000000000000000..3d296af53024ab03d12675ffceb9520ae90806ae GIT binary patch literal 16479 zcmcJ$1yo(zvNnjj1`h!??y_-r4elCzLgMEKFiTq6 zxPY9%cN-%YkT}T1-V_92mIK+DyI26&IJp6Wf=JFTP9P&&q?8Xj-<^NvakY;(eA@_l zz&Ydjh)VNd7$*o7!V=e`^t|Aep0CL5S7$ge+29jgqG5LSu|E+3zUQQ`M9!D!>No)I z$eV0>WG+=BdLVLroPcr%AZm4d{wnsPo+&YclROmNz;h&U*s7@LNb&LnO6NSS-JYUl z5_)tu{Y-;=nbu1P#VhzI4+!iTK1tPnP(LVGmYF{t?OK=D2G%;Yb})3%CCXi4326AnlHy|=e$#SXL) z3{{};d97fW(cUBH!ObBH)K7ormxvKNAaBoqqDbIIT>oLGR*_)_>TmZUYvEq^d zW;Kw93*ZBQS`4QE`VavFiQ9#@whlIB-K_U)rlRb z_`T1X>dMj#upy6|*5eg^<-^l+mb&&qq(3@l#V&dHsF6UTQ1YXIUxnrI7=vq{nCf9#&c<^dsuIBUf=x*al^ppnU+-Yq?}OHPMVyjk zejNoUggWZP+q}-wh89O87P2K$mBsCJx0u4c=5eNw>>QTp=DUOl{zdUlzSqLA{# z7fA4x^i+tVPW1ckYw1A21$Bo8xd1Yi6*&I3D4A5)Q8%}d6zsvahQW#)%?!wx&pS!3faem`-fP{&bn}O?!=#d{{1K&>n2tsa_2?)TuJ3mGtcRiRu0suJ_uc+P8b02GY)|E!BtcyKkae zpmkSU5%I|cQ$}$?cjT!wKda3SRY6T*sg(Q-KH8ynbf*MU+EhF-aZR<8c~ZgWh^4uG zzc)dcpGmKr-(o7k{W^2v6V5se(tej9{HQ^;jlBk(WLua1mTwkP`9URvk)7J7Ix)%` ziD@Hq6G%WxKe+MK%T{+;^NU(d^P@M~fiLxJHQk~35LHN_(rMcZedlkepI?V%Nig=p 
z6qTLnWZ=D+ND^7qaK%-n2khM5e7j>yV!xF+5j`VyP9BfmRCr^R>MbF?_IoB*Q+L8Z zU~|7|cJLg|He1g_w{Q|!X9~kox#%#4@O@HhXFs+@C@lXJ4K$+zv8Vd;PU-n&JB(xL zSIEKLD1_bzk$fY`-r1x_Q~#~q3lUP0&yz}MUa(W-c>TNQ-B*LftQOz+FsdZ1-hWKg z;Qy_U8vO*XQaF^nYJupd<-~T}GEhR+p*)4CFg5i%e&NtHzhn|C%3N4HEfTq60t*8j z`yFBj5u{sR@L5Z)aQ)T2>&|E)XV;@{>GX^JZKQ2ZfO{BfC-4gIE>yBuOAmuXIq^s0 zX(C?ZDCX?1CVRuxz8>BFnKFYWywWl}?Fe$L@|==H0_tkDZ(T9iGMwki=bxp6p+5`1 z)T~8`d~h3-bL+caR{cqUJE3-KyEo7$%p1VG4gVf7YeVM#LeOkzmqqR!Mz(Uik99K~ zlE1rJf|U(o3GtRgQBkRl$3!A7NeZO`jT-)5Drt0({zkmXsnb^ZYEzNp_iD^Aos}hd z@(&}*Evyc(J$2*5yp31i)A3Z2jbn?YO`;5uE@WNHTS$9}?6B8EG`Ff%q9_U7u8;!P zsOCFPL)9V*m^dL5{Y4}wQ1qRrJ+1xp36&im8mE1x0MemlMy!H});88!=MFyImUZ{y z7A>~0=z}nn>8KysycGPr4F+mrc6}q@v%U&3-*;)V!ye@SHeM+{;sd6~;gfixqh{W^ zZ2zWXDxTtdr(!L`1+4|5l5>ksaIFhq!%kerO7FJR5UjgOWh!_-fiK#4KNF%I1{dG+ebqmM$swpn^KXGi*`sj$ZZC;u0k@ zhE-f5${Oo0F=@0}rv>&eVuy(k!UUQh(yy~(=z+GsZbFTsukmnu`JV`e7orQtCb>6d zpV~0*rcI^Ab*w<7HdknKU&@x4L@P>KJ58{^OrY!2sRcXTdjFd2nWYk`=d-F!9yaWZ zis1qvE^lWu`*1^{u%Kqy$b1+?IKp2i{*4ts6|jWMYje8iQC(6vxk7MvV(gz-;Dayu zF{X2QgBv0;nAz-l-`9V}pw8tkc0}lWeZBW_EX9Ra+J?p8bn?^!;ll#Mnb-bpV3|z2 zmF4+yo}LbD`*-^5@=a$xUSr&=3a^XoeO!`AM1*EIm-|R$9Qk_Wh$=5R($Rx>YI2+i zuC5A92GHI52I@GjfY2;m?-hrnj0u1*D&g)Un<+g#rq%uSJ2C}{&jM)fQDkp2brr-~ z4L~j|5^PUQ)>Gg_cHur)~9mLtG ze1k8@iK$J(l6;9O71z1@zE;kbPCR!$EpMKCTF((G=t68#5<9Fj_oEQWz^1Yztdyh_ zt3;wBi505itxF>qNhObPhOhm~qpV%6;0R)0JbGnZEF@pi6-{EovF0^b1RV*B#L*;T5}LDTccB| zZGTEbpcT`NpDDe~{7{+}khfZhDzjTX0Ivdr%lz{B8c!FaIVJO*hp`kVDvKf)yl^Sd zx@oce>(MYE%}<2nQ%1wrXhF*}BAck9^DaYAChYl_0Q!mHReWJlpIer~8*Uyx{d|gz ztjQkUa#uw%M@IoSjZW2XbyO3tm;b>`8TBcnzjxveQiiPGlvT&VSCruwoQp9hk!RSL= z(!eI?8>p7e?Qm7B=+EVFEgx6vxMqGt$Lgcoah_WzKglc{QDV6vIM{#7#8OX4BFDW@ zl%d2K$=WJ{-y+B)fE7?sKd(x_^q=q);p85@$R=AYb&HfS+(2tV3OOsR^NngBsElze zu@h+>C!BFz?GQVClH9t%mrG3DjS2M_I)@-ED6}z-v`>78Z{M^il4A&~*M5EZiN8gb`W;mr=Pokj4 z`=TBJ`%5&>!AR>aC+h+9-WCSOO+xr)qp_Kio}#~To^h>w*l!laz0-Ie7*81Nd+_aX z#}%?X_28o6^1#Fe4fS(8B7(&Z6?2?nn$*D?!l6=miMPedpk*c%nswA1t1VrW)Udv( 
z2li(&v7j*Wg4Cc<6p4&G^9!hsMxEIV$x764&1;evp&%MK6OZljbD|AKh;O#Nq+UY# zmZ9wN-?1hmxjf(ZKMu|m(Z8DoZU!P}Y0f105#*X-Ww9%P+WV#lSG6}ZSB`k26J@Cl zP?~-e;}EO@J6P7Y5?v=~3|7OP@l|N&CgY~C?NIUJ3>Sk{T+|jgkmCD(xw3Y-vHZID zfjCfm-u82S?A-@kjz0Zk9*h3JZ^K`Ax38OG@J3ihOia|s8Dt80-N&i`KK!+B=J~g6 z?f-3r#rnr?`+teBY@0XrgOkiJ;&q^Y(jVFVc$nTBEfQbP6oR|cNB2HePHhmnzw9oj zW_`1Z704@(r%^4AvN;s(kBEHv+!eBgU(I2J4cSCZ0@voJ3>zIDblqpzQ+qU|ytVy- zBIZ~Co(v~@Fu%pLj&UINdTHQ7o-DYlL_)h^F(jRR&r!h4DjOd@xQf!&7vb-w;{G=~{6C49fUgk|I7Iqq$nSK$ z2B#MOZTxF|_n(UW5&ki&x*EIu5w5B^xq|-G`1P;m&qk5gLzG?`Wa9>Mu{1IIGiDSA zIh#0HI=I+7A%Wx5KWrdj2To2{+JSYEH!@MNw>7f+7e~_4$=OBB!pI50#tM!Wjs8IZ zS;3U1rKyX>>ua-eaU;Fj_wPU8762O?+iQTVMJA>h*7&|I+F&=^QL9e`xVaziRdupA{_a^v$Uz!pX>I{-1abkGRb4=~8UP+{0JGMgY$Og=aH`;yEe+u20MF!R{)d$T z?Cii-_we|W0=~ZKA6kLG!5~%u@XrLuzp@X1F#ht`Ka*ZyjXAhj*#DVw0J5;XQvcIO z{D03mI0jq4NE?W-7FMWEA=1;DeUKN>&WX$UO*PT=)JPVPX4MRHA1aZsP?KwlLwko6 zMS9FF`B>poXybU^TPHPByz|lQJWiP{x6|MWA)W@hF&YEwLuCBgxsLQ_Wo#1uf+#Dt zT;;d8VYkUp!6p8A%h98y2e zZE*{xZfX>#j0>y23!1~K`P#b5FX@Vp--uQ1?#;MSWZ?tJdGFQJZ)E%!%Dx}X_M3qg zKB-E9vW*gNapeO6!w!ASaKu6@#n6-?E`w`llY82BU2KSjI~p$4G7!#|7uV7qph6v{ z_oZbKdL7$0dPJyuc$?8yGf2rr%FXAY>ic}X?FjW^ z*niTp^l+zzH{ohy{syc&R*G!KYcE+uWqMXY(6KqtEz#$Q_Sh)lcNjv*j&nxXVc&q- z@vlFBZ)Ew?l3wLoNP)U=Ph*%GTNtxmaR9P*`9=nJyY3`34$d; zFt(y{Y`At)u`ab(^=g%oj~?}^DVdjRqI<)T(S#@rBaPYD9PWvxZ<(#tkAjhJVU_OPT=t)28s#ix3`_ z`Js6ygyouYCOjhqhvxVL1{%#M<6Z*NDEtQ}A6udkZ=q3(`!~Y`$oXwa9qoSPpBy&4 zhBwGEFzPkRKNSWV=X07BDk*IbC15Nd_e^^YGDToXHIppS)8PD)Q1Qj$nl9bZ{&m^y z7||>x#*`DdSG*?}|G@5j0H^=PF?HTVw3Cfj9C3(;c-e$V27`)z!vap@MT2+N^{ z8rSjrbdtfh0w?4;qz-UCxE}k-0-+1*<78yZzZ?2bGilp1UA@_c49YK!?Ve&Q(O!Bc zpX$D>$k1nnmXaK#bvqBj5wi`YaC`BP>(-^8mzS(|DGtQG+>5R>5bDFQZ^kXTEFbN_ z&kSU^)|oI_h;7$}BYkQqQ5K)^G?$Z|noUHMpN1ilCj31^tEQ$!i4bX~q@p{k#gmQ; zO|)Fgta#6UK<*h^Ff|Nd7q+u0Ix|V^jicy5$RaT04`FV1^7s4!^g2U}!5mIsOdt;V z!fo$Dm#<>~Mo>ys>wTczlK#(C{XuBI$MzAG%cl(a^jTcrC}?KcbZdouUbL=cJFE$qXEa#4Ft>R1z3C&SGd9W$w9j_Q;c$s!?7!9tgufIWpb?(yYR3=*Zw6#^HNOs&UN=>Ko8B3bu3e 
zOULpw4Po)oRcDXOzjGjWWCQY-1&dwmO+?naDf2CYyt>5k4%z*dgLhFb=WxcAap(hN+3F^3p2@rWjAkAf8j6=C1i!EeBI9BRQ5i(+o0*~PbK$HQw71H! zTCt=)cr(q={dINotvSH~@&x&bHOynqCGv(U{UW~?AR z%%w5v7JvPIVaQmE8rH;WRe;6;@iyhD#atNyDn>Uv-1(ZauxF)qgomvs$8$!g1|v9m zl_Dg2)HrsurWr)GXOxOtbZ%Y9+?%`@)zgtS2R{>a&&yt>_6g}nF}zGliT`5&hPt(U zyFv!B&O@Fw^j@`a?@hP>-9Sh@vvV2Qu;dg6TIo&&abM$OoswJWWQ8-1O zlSSvzn^qo*>c$hhW2{ZlkI5Z3oq}o})qbN~ygX>0Jp6H@1JbjHYwu$}Hj!!+5hN$d zzyi@e1~tkjuU4emMYlgw@~)%u?ZF=R$_5{?UN1$ta~B{i>Ae-%a!L507`BSWIRP4rJ9(d$yPjW&40``-BWxn+SC3|km2m7!~iqHapf zYX|}VT1v{i?lGL=dbJVnkN8qHLAT_whkBN_Gw~*Ojil;s*QSARDU&c|B^u|&Oj2#q zcI4j&Ga>g?UoaK6eW2R5zk$@KDXJ81>DxzwWeI>)Uex8TYyttCNnK6@&(H64m4PZ_ zA@fhQ?l{{`q|Pf2d$X3=1T{yrED|-78eP>hUq;-WO<*sG+l00SC;rmliWtIc?A4jJgQUST_PU2Y`!f2C^Qf1n2OugAV@mN z5kLpbi=!Qe8ql{l`9orN8Nn1F)A-o^qs@{U~~dEK@H{84{`01 zbtG*%nM~x>N28t0X(%KlO-@?GW6Ab3Y=|mj68E=5u|mZb&3>IL<8_50ZnH9RAeIpI%#T%SC*KgMr+0%nfgY(9I1Sp?$kD6({z`5EoVy?r9f(@BCb zY*o_DUAYKM4B&_>6HU6Td~YSJD3zTfb&qgSe4KZLktf(IBdz9F*Js=~DH0N)Z6BPM z*3Q&^K1dMK(jw@}RD9>Y`+npXZ36{N{;)-|zdHnTW5^9`B0sdMxx)E68qS)y%_!FrPR^vN>x)?OAn zN8z*Ar#mTS+aIcxBGeo9L>5&)EJGAORC1FRFk=-lrDABV4yEyQThIZ{HJ*NEYO#dO zN?UbxMzaqEx4grx$?C*dcOk$I5&oU#VVBc>#4UkB&L`{*pF%C|4-C_jqYFRQhur_f z4VMzUr&^{DnfjfdwDFv`0Nc$S6iLa2+{i?SyU(mqKExApQv<y%3O3bfY_TZBa9aNQgKpP^8MHPs||Gd^_f@q#Q>Y3c0{+5X-}`*Qz;t9Wy}5 z+h5&Pmhq_WTZ(I`x6|S86cd0fm|}_ccF%1tb)`rzd~TlWXugqc(K=n_!;vW6aAla7 z$4*J3C(La}ZC4Yre%pNaRnjCy5ww{ZWL^{I+_#$PgSn7Uj^;)ULYhl;Y<@&2efG&6 z8o7J_Y{==pum#JUd&}PWlv6w{HL>5kNCUY$4(CA!f(nDN}g!!>n9Ubj#ya` zBwQC+h7gMvC(p)WsYjlS@M_KI48HIcdJL)Z7l|u2+hg7Gg<7c#24)SVc2X_Z(^g_C z>mAte{#a8OT&5$??6u58Tur7CF!JhgIhZiS@tTs(t!`}DYMZ z_e8+{7M5*fDT~>rMkKBna#nWD_AQ(YbLv&?d z7{Se?p>k+O7i9{F&W@F)dyO6)`E01~sW7Y&uOHu2>v+Fa>z!L@gEo?)c zDz|OrD_M_wigug?W0>bkrLns1n-?_I)XQ+Q3Y|$yWphNRm}|+m@phRwI+)uFFZF+q zv|fu5enwa-s>(24oGR^RsD35}uJ38)$yrEKq~7gm)N66_F`^&-;POpUbQQmcWKzdNoR#~@VqY?5p1fkmfTQNMG=znLpq7QhfPU+!mb`kRo#=VOqG!x7$Wkc8;@~0 zHsahjAzXaR40ORXLcugF+ycEQ8gHNHo~925_qxh 
zvJLN$B8)=c-l{s>IeiQa`}pM^qQ!iShSSZkl!cAk%&-Tc_&LQYexW=hHvR35lDa?1 zs!-ub4k+5(&wV}zH3XTpR|oJ$>1Sy5c#D+Ldu^O1!A5O(mI_L;LOH0ZPx$pMtv>VV zsRMayCHOF8GBl?EYE$YJ@Ei@ z&PhfObtWZ}6iOV%MNYtg!1hQbOrw+T*1^nM*hn{mZi}chfYVG4hS93nAN0OBtoRZ_ z4NY^*s7UeIDzhp6kXed34o6nDW=PGjFVdAQ^qtFiW^x>}V?5?KL6Ws&NmrdM6P) zZM^5D2g#QeH30RTd$?_!ltsB=a`F(j`IO<<^~k2x5XqZGdDfh7dc(h-^adK;flY3w z;!~~qr?=j9ysB}bI8*bB<+@BC^IHgpIF%^2xr)4wWKdD!2h6R~}5|F-xbeE0MW=rh>99y?AJkzjq zM_u(nJuXm+#w=}`6KF&J`t9ZfM!ZJ>u$5hxjHJDRB%utQNZKWDK`M;Nf#hn9_3@@R z{VrQG`@zCyCDw^ua!ZqW3#J605~5(#rhn!{V+m!LAjL+mFlpXSlyCet4{*DJt76`!A7sSkN&ptNk)vH6>uBmt+YT&TQZ++-CNj`(3$CVVaoDF*yvTvf?mSGV$LzN}9H4M{!p{tO4Ufw}mP^CTRC7C(9w6U*%-^ zp-Tdl#Q6B*2p%hfm%|#)*M2rbEMKvOxClyB6^nFJ89)3!gz;aApa8eCy2O8Zm0 zBF<15m*;0T97C(w0;3rwB-NN#Pr*}D71T+@a-_ql8Izv|r~lNRArnb1BiW{M#&U+l zvAAeKrfIFPTMI=G>!Y4Wy+!2hA*(HJYwVAzwhx0}KgK8+ z*!evtuZlSdSJ)DgS-tQq85z(4G8i(%-&-rj{}Lo>ot>`bNtSwKFz?OtEY;bk93-Lm z`N>AmkC55Red^$e^Qp&X3`HFMc8yv~+c*$q*#XW0F&6;`5$dZt(h^o*gX8S&Ijy)^ z$(Oa0eiXXyWMW@dtVyLQt!OJ~vN$6JEH!?-Vu<7CG?U5`PCM(v-%+ z+TYS!@2#?YO)!;NM(xv`Gp-UHCOwkJ5UL0!+I<8gBI&v5^XiHmoF``zgSUEt(3-ZF z(HD}<$^&V-JrVxsl^r#~<)68(d$ixV|6)ef`cr$*1?!Sd(o}w=OYB_yxQ>Sx&-H#hiqi{@)Y4`F&w_={%et@68ZoZ)w zy&zuTc`N?K+M@TF*UQ?GJD_j*IdYvSY%luZ=d(vrw@mu*nsZupGOFk1(!>fkb{3i) z_LZ)H0I6h*CNGNRYQi#~Luvah2_(X4odKP_!`MvY7$n4ejF?z}5+xLw0M=J}2mU4n zg;{@jp?0SS%j}Z&chUCdCn7nK@9xqK>5`MgX~QKQ4m<=^lPxtlLcWMBD2_u7J@q9U ze(yi=DGWN2Q`=I(=G>nt%57iVw#3aM+vnV6TIeR6<)Xt_^R6e^OeBW&|CpF`gu zgn$+qWlIzg8h%f2rSEZzGT%Nw9(PuGi35<80l<7v-PGzm^_Es0ne5 zk|z8$qt`V!OnQ#V$+YhC64Ue;$_O~WQ#5MOm5Ad*D-4LaylNGXqc&nZpwG`X)eEL9 zjMt-avabB$#CGS^qi1h|Uet>aVz+{W=-}L-C1;<+>`b}>-Q{xf`;lfNVT=`KV=ct< z)7hGyv(ZZ5)`NDov8X(oi1J))or-KqO2&43h__={Z3#?E4pzr(JP&y||FN3vhbG1B z&Uj){^v#u8v7V&k@|Wo9xiQ_`ek^jMxt}3vs^)A-H*wP|k=0D)d6=}xz5=c7iJt_2 zgykNst-tV+x4zf9(7@VvF%DboB>C($pG6-!N#0=^-`7$+dGe7+l(`SxHOWRY8GS*@ z>$;wF7BBVyCkFNCof%*$ei5H9Vj2p6$~y$;a$UV5>wEf^jXS#SbP;J8ph~cR4;TiIj`(k7w{? 
zUb`F$zo5wD@`3xXUdvKt;E?M>H9v-%abEsG$9J(n@+~{VRf#qBPAy9(xV#UPrSxB< zi{sw(y7eMTO>0)x_1k0NnpnQf2~pEa(`d_N=VhcgvlQsbIdz3*m#q*1_!8T^5T(CJ z@T%C-60F*q!Adil--RhBUl()&ff@yJQ)=k6O1G7KfcjO5EO8ZhX=;=A7nefD|B z>IRjyQ^{T9#d3){P*N)Z|1;#Lty>Hv?>A;r5J9sC0~5n4VLXs@o6f|@QXBTi3q!MY zS~#khCKfY4eZt#s&d#5`$-czTpE$W`P^+7yLG+K>+gy7&q(E54R=;mHc0?+o+8!@_ z_!)B=k1crX?iam2ZD?4L8I6r=_+Wunw7@XOKiYTOw)8!Kw{oQ+=>Da0$bFqb^<#zz zHsXEyFMnCV{GznT4Yk~Q2w00*pU>AcsyZuNO+5#)jJUA>fnC*b^h3NDmKuk{#QL*v9E} zgtBNBd1-=qpm%snER5E#bJaA2AhI9_?i*%}^J+Qu`>to)-L1*deqMkGYOjz5W;(xt z>6-U$u)KgW`JiJ@xOw)@G~l{GqOMA|V|&xsXX)k6u^a>eJbp>idL z$ohL^eHj*afgitr)iFyAo+bcs%PJVyraw0c(hN`Ny0K*`Ca8NST2J@)QMv zS-LnCRNv_MWyg&{(sDxRTe`xMqW2MnKD@#cs_P8$x_0GiZ8X+u6e9+sS4tx^jIENBCL;Sm=#{7VDBJ% z(CygvA8gjH#4-9dB2hJThrd_;k#UPqW7W@RLNqICS1|j7oDF`+0peZrLcW>@DeGqp zpLh>4qDlbF3G5zp{>L<$1KlW$;&6F);_aVXi32%E)8b#8PxWM%nMIVBIC)zNh5HzA z9(KuL?LU3Jwg&1RUFYJN#Z3I9t;Oo8Qu>y6gjXCj&yRGyykn^Y;84b_3qDC!^49#z_!SgVH^W zKr)LHvl!`i1+?r&J{!0C2h}-ZiYKHl)`Hr%M)>Ce#DuCpS3hTUK1#OS1Zs_Qcevru z*FnAoupxq>de2)@W*tdEcGt&BW^p5@j zrK&3N;F$L8TD8}O)?LLOhY)99o7fpkseTfs!k~FoB7O|T_4NTHxS3}At%Pn8uOXqx zQb9D(y#4yp+3UJn?-!$u8kAB^ok*CSB#oCto3f+a33p?pnBB#eFVqB&ruT`w_S+O} z(VOhbSv_i|;j2jzVGqR3)du1^{V z)#h&utO=y+)$2GaJX4GDWE$?yZZ9a_w`gl^J`Rq0`&Bla{bG;9{$OK2D;K^Ci6r=h zY9hXOKA^v(q*7w)%mv3~==QMX8DDM^m%>`2&zN*2xc$R1QG$}q#uURh7GaYOlPu{s z6*;opuJc25LwJnc1IGcqO1j-jfbP|y&nzNsT4Uj15+oO#`c`2VV)Hx10=kS-34kQm z5<9t2_mUGkrq?i~W(cls$Xu2Am!<~cT$1Jr)?OV(+g2*NY#S|g(sT|skh^l`nCbPE@RpHg}3&a?b@y5Eup;A(k%L z2<9UTIO|${h@vG}Qc%$hoKyN|Ba+;<9%w+mRC^x-gVKCbjtNIS8-S(c%zF%62;>2! 
zfn7u=6b_drJjI!U4(Lf*$FvogO?Xf4Sc=xs#x4Sj@=MU9Ld#v|7B_uK)78@seH%wQ z1CRH|s)txeE5t3=YtT#5WDyN2Bb1iZ3q-w2(&|n*&Ji;qZ*-J+-=#A92-|9v4`(Sc zf5$+b-5k@F_1a%ssy_2|;Gv?OYE->rhY_BUM&xfq_w<3n+)Zf-LuVM$(T}}8g@|*5 zh3k}93s`jQAuV;uG1}$bwIr2{FvS~@UiTy!VyV{+?3ZudpKbB7)ND(h_W9bf9=S*q z;V^O=vKU>B##3G86NG6 z$tN!{fePTFv#ljB%=Vi^qb>fMDZEy(Ljyn4MHDC#@)B=CW&{sbzZ8NL@jA*dE8-Vc zJ$^{Sfzi)>#<`7ZVk>WsUFlbUEvTi=WRY{zQc+EZ(B$olU80|-530Fdrj$vL+7!;& zgfbCW1U*O5WN{R-yKO)+j?CaO-Ert9u{p})?QHQ2^3MdZEI7(miFvr%)7k_03}{$u z#9agMLd?=!olVg7fs4A^O7#8PMVfD|-xbBE;j=QPCl~G$rjK6=A`&EnyMBA!ei{6$~IzKx41Z0Gd zqE$JR&QtBCsF!p$xy@* ze=%|o3LicWt<8uVn}S*3N$9RbGZL0LRJ2de*1MSsXlhW9UZ z^Ml^5$8`fWVbm!%8kit4^&8hi^GfLsZsI;of4+Vyn+Z3@Fo|h$N0_RU>CkRDM{il( z=1ZTS7zaFQ*S7+iih?x)sVJD=J;;jvGugi#P8N|}rcZj;ODc*pn(OqhWVjul5tkQn<`csI@P%r|l_DhZiqdv`Vtd zn{Z$i*eNAKP0P>gbeZ&2nnacEi7R%TyTg~$ag$NIpnCtt%nun$`{im&N9Ol&H!spK z!M4w{H=v>U%^OaWbxKYAHdP>b-gwHs`6%Ew69eZbYd~X--;; z5DisgUKyw4!H3)GmzY{CD4U4V#-tZ5&}q&yf!~sb>B^#H!an~RtBAiwb-3!Wl|R0F zWIsRt32?OX*mKJwI)ogXdl%Deo_8-*g(9<@zj?~35RcCIsfvz{N4tLu4Y|`|@^R<_ zIbl4~I_DfXy9P&B@ow0Cl1014F)v5`^2|k8w-Xx&peK}@)&?Eu0#u9Zr^r24jcn#M z#pcg|$7Q$Og~&8YMCud}*hlIHxopJbHa?D8ALUpYJ)h?icR9D*Upw(naQcekoDM>% ze?M7;=`FVXc7>mtDjt6$fxyA1`$P0*^>Fly`esZ7O23JB(?uQ9F&QM+?Xco6gCBu( zPs$Zv%6hyC`{v2BEIGPry7(}@-M_c2V-Hj7L4)*9Rqrhz-c&jKy+YiqH*Acx8iOrt zJRJL>LGrfQRXLn=GbkU2eyBA=l0_N2p`HrXLj|s-Y73Nb79;m};4<*PDlbi4O~6&U zH2+nDtH;91(XPh=WY+{=IGLODSlBq_!A$ln(Xbnv@WYqvPXP9p&W~U|%X=YEo~>)b zsF&b=vE%7QYt!xPXs$=`LGYcWl|h^9m42NpJ?G`K9t#T__?e?{mx30FSqxkN42I(V zkMhq$_bZ31?ZE>$%w}v3z=MFN3MYbKn)CIUv%|awEP;(Rih_DqPj7Kf*&~!0+{iWC z%CpX{n@3{IY+S}BWQ!X=BZgv(kHWQ#R50J}~uD|tT1FzeE zk7Z+L2cy9Mb3Qg64zMHqCyyP-{x|*DIl*Y>|L(`l^WQum2P-$Yvfw}F19EVJ;nM%) zxi}etG0RSW)aa{PdV#=e3idR6dlxX68VsIR0WgEn?@pc$VEWZX!L*E>nLQY8{SQJM z40z`^GcjUg;bvzw<^ZvqfmP#V1Dcs~oAGd)v2Yr*fw%;b{?A?hgo-=67=izJ^lDuo Q2M;$p5+$XCq9oG)1vDX;_y7O^ literal 0 HcmV?d00001 diff --git a/go.mod b/go.mod index 905f3fd43..ebc81f49d 100644 --- a/go.mod +++ b/go.mod @@ -71,6 +71,7 @@ require ( 
github.com/go-sql-driver/mysql v1.7.1 github.com/google/go-cmp v0.5.9 github.com/jackc/pgx/v5 v5.4.1 + github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 github.com/mattn/go-sqlite3 v1.14.17 github.com/microcosm-cc/bluemonday v1.0.24 github.com/pinecone-io/go-pinecone v0.3.0 diff --git a/go.sum b/go.sum index 6e75d73f1..4e124f572 100644 --- a/go.sum +++ b/go.sum @@ -257,6 +257,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo= +github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.7.6/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= From c27d44f4ddd6e198f685946aea947127d56d0091 Mon Sep 17 00:00:00 2001 From: Matt Plachter Date: Wed, 12 Jul 2023 10:22:01 -0400 Subject: [PATCH 2/2] increase timeout for lint --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4ce680152..6d3a322be 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -20,7 +20,7 @@ jobs: - name: golangci-lint uses: golangci/golangci-lint-action@v3.6.0 with: - args: --timeout=2m + args: --timeout=4m build-examples: runs-on: ubuntu-latest steps: