diff --git a/go.mod b/go.mod index bee181f66..e953ebcb3 100644 --- a/go.mod +++ b/go.mod @@ -18,6 +18,9 @@ require ( github.com/PuerkitoBio/purell v1.1.1 // indirect github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect github.com/andybalholm/cascadia v1.3.1 // indirect + github.com/antchfx/htmlquery v1.3.0 // indirect + github.com/antchfx/xmlquery v1.3.17 // indirect + github.com/antchfx/xpath v1.2.4 // indirect github.com/asaskevich/govalidator v0.0.0-20210307081110-f21760c49a8d // indirect github.com/aymerick/douceur v0.2.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect @@ -30,6 +33,7 @@ require ( github.com/go-openapi/spec v0.20.4 // indirect github.com/go-openapi/swag v0.22.3 // indirect github.com/go-openapi/validate v0.21.0 // indirect + github.com/gobwas/glob v0.2.3 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/protobuf v1.5.3 // indirect github.com/google/s2a-go v0.1.3 // indirect @@ -42,6 +46,7 @@ require ( github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgservicefile v0.0.0-20221227161230-091c0ba34f0a // indirect github.com/josharian/intern v1.0.0 // indirect + github.com/kennygrant/sanitize v1.2.4 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mitchellh/copystructure v1.0.0 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect @@ -50,12 +55,13 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/rogpeppe/go-internal v1.11.0 // indirect + github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect github.com/shopspring/decimal v1.2.0 // indirect github.com/spf13/cast v1.3.1 // indirect + github.com/temoto/robotstxt v1.1.2 // indirect go.mongodb.org/mongo-driver v1.11.3 // indirect go.opencensus.io v0.24.0 // indirect golang.org/x/crypto v0.9.0 // indirect - golang.org/x/net v0.10.0 // indirect golang.org/x/oauth2 v0.8.0 // indirect golang.org/x/sys v0.8.0 // indirect golang.org/x/text v0.9.0 // indirect @@ -71,6 +77,7 @@ require ( github.com/cohere-ai/tokenizer v1.1.2 github.com/go-openapi/strfmt v0.21.3 github.com/go-sql-driver/mysql v1.7.1 + github.com/gocolly/colly v1.2.0 github.com/google/go-cmp v0.5.9 github.com/jackc/pgx/v5 v5.4.1 github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 @@ -82,6 +89,7 @@ require ( github.com/weaviate/weaviate-go-client/v4 v4.8.1 go.starlark.net v0.0.0-20230302034142-4b1e35fe2254 golang.org/x/exp v0.0.0-20220303212507-bbda1eaf7a17 + golang.org/x/net v0.10.0 google.golang.org/api v0.122.0 google.golang.org/grpc v1.55.0 google.golang.org/protobuf v1.30.0 diff --git a/go.sum b/go.sum index b643d5bf2..663e827ac 100644 --- a/go.sum +++ b/go.sum @@ -60,6 +60,13 @@ github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 h1:d+Bc7a5rLufV github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/antchfx/htmlquery v1.3.0 h1:5I5yNFOVI+egyia5F2s/5Do2nFWxJz41Tr3DyfKD25E= +github.com/antchfx/htmlquery v1.3.0/go.mod h1:zKPDVTMhfOmcwxheXUsx4rKJy8KEY/PU6eXr/2SebQ8= +github.com/antchfx/xmlquery v1.3.17 h1:d0qWjPp/D+vtRw7ivCwT5ApH/3CkQU8JOeo3245PpTk= +github.com/antchfx/xmlquery v1.3.17/go.mod h1:Afkq4JIeXut75taLSuI31ISJ/zeq+3jG7TunF7noreA= +github.com/antchfx/xpath v1.2.3/go.mod 
h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= +github.com/antchfx/xpath v1.2.4 h1:dW1HB/JxKvGtJ9WyVGJ0sIoEcqftV3SqIstujI+B9XY= +github.com/antchfx/xpath v1.2.4/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/asaskevich/govalidator v0.0.0-20200907205600-7a23bdc65eef/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= github.com/asaskevich/govalidator v0.0.0-20210307081110-f21760c49a8d h1:Byv0BzEl3/e6D5CLfI0j/7hiIEtvGVFPCZ7Ei2oq8iQ= @@ -153,6 +160,10 @@ github.com/gobuffalo/packd v0.1.0/go.mod h1:M2Juc+hhDXf/PnmBANFCqx4DM3wRbgDvnVWe github.com/gobuffalo/packr/v2 v2.0.9/go.mod h1:emmyGweYTm6Kdper+iywB6YK5YzuKchGtJQZ0Odn4pQ= github.com/gobuffalo/packr/v2 v2.2.0/go.mod h1:CaAwI0GPIAv+5wKLtv8Afwl+Cm78K/I/VCm/3ptBN+0= github.com/gobuffalo/syncx v0.0.0-20190224160051-33c29581e754/go.mod h1:HhnNqWY95UYwwW3uSASeV7vtgYkT2t16hJgV3AEPUpw= +github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= +github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= +github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= +github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/glog v1.0.0/go.mod h1:EWib/APOK0SL3dFbYqvxE3UYd8E6s1ouQ7iEp/0LWV4= github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= @@ -249,6 +260,8 @@ github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1 github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/karrick/godirwalk v1.8.0/go.mod h1:H5KPZjojv4lE+QYImBI8xVtrBRgYrIVsaRPx4tDPEn4= github.com/karrick/godirwalk v1.10.3/go.mod h1:RoGL9dQei4vP9ilrpETWE8CLOZ1kiN0LhBygSwrAsHA= +github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= +github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= @@ -302,6 +315,8 @@ github.com/rogpeppe/go-internal v1.2.2/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFR github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= +github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= github.com/shopspring/decimal v1.2.0 h1:abSATXmQEYyShuxI4/vyW3tV1MrKAJzCZ/0zLUXYbsQ= github.com/shopspring/decimal v1.2.0/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= github.com/sirupsen/logrus v1.4.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= @@ -327,6 +342,8 @@ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify 
v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= +github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= github.com/tidwall/pretty v1.0.0 h1:HsD+QiTn7sK6flMKIvNmpqz1qrpP3Ps6jOKIKMooyg4= github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= github.com/weaviate/weaviate v1.19.0 h1:JKmScZZ5VWVESCkji37bT1cNFCCRIZrne7ENoRHT1vM= @@ -439,6 +456,7 @@ golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qx golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= +golang.org/x/net v0.5.0/go.mod h1:DivGGAXEgPSlEBzxGzZI+ZLohi+xUj054jfeKui00ws= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= @@ -503,6 +521,7 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -510,6 +529,7 @@ golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9sn golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.0.0-20220526004731-065cf7ba2467/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= +golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -521,6 +541,7 @@ golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= diff --git a/tools/scraper/doc.go b/tools/scraper/doc.go new file mode 100644 index 000000000..ee51f76bc --- /dev/null +++ b/tools/scraper/doc.go @@ -0,0 +1,3 @@ +// 
Package scraper contains an implementation of the tool interface for
+// a web scraping tool.
+package scraper
diff --git a/tools/scraper/options.go b/tools/scraper/options.go
new file mode 100644
index 000000000..566dbd584
--- /dev/null
+++ b/tools/scraper/options.go
@@ -0,0 +1,103 @@
+package scraper
+
+type Options func(*Scraper)
+
+// WithMaxDepth sets the maximum crawl depth for the Scraper.
+//
+// Default value: 1
+//
+// maxDepth: the maximum depth to set.
+// Returns: an Options function.
+func WithMaxDepth(maxDepth int) Options {
+	return func(o *Scraper) {
+		o.MaxDepth = maxDepth
+	}
+}
+
+// WithParallelsNum sets the maximum number of concurrent requests
+// allowed for the matching domains.
+//
+// Default value: 2
+//
+// parallels: the number of parallel requests to set.
+// Returns: an Options function.
+func WithParallelsNum(parallels int) Options {
+	return func(o *Scraper) {
+		o.Parallels = parallels
+	}
+}
+
+// WithDelay creates an Options function that sets the delay of a Scraper.
+//
+// The delay parameter specifies the amount of time in seconds that
+// the Scraper should wait between requests.
+//
+// Default value: 3
+//
+// delay: the delay to set.
+// Returns: an Options function.
+func WithDelay(delay int64) Options {
+	return func(o *Scraper) {
+		o.Delay = delay
+	}
+}
+
+// WithAsync sets the async option for the Scraper.
+//
+// Default value: true
+//
+// async: the boolean value indicating whether the scraper should run asynchronously.
+// Returns: an Options function.
+func WithAsync(async bool) Options {
+	return func(o *Scraper) {
+		o.Async = async
+	}
+}
+
+// WithNewBlacklist creates an Options function that replaces
+// the list of URL endpoints to be excluded from the scraping
+// with a new list.
+//
+// Default value:
+//
+//	[]string{
+//		"login",
+//		"signup",
+//		"signin",
+//		"register",
+//		"logout",
+//		"download",
+//		"redirect",
+//	},
+//
+// blacklist: slice of strings with URL endpoints to be excluded from the scraping.
+// Returns: an Options function.
+func WithNewBlacklist(blacklist []string) Options {
+	return func(o *Scraper) {
+		o.Blacklist = blacklist
+	}
+}
+
+// WithBlacklist creates an Options function that appends
+// the given URL endpoints to the current list of endpoints
+// excluded from the scraping.
+//
+// Default value:
+//
+//	[]string{
+//		"login",
+//		"signup",
+//		"signin",
+//		"register",
+//		"logout",
+//		"download",
+//		"redirect",
+//	},
+//
+// blacklist: slice of strings with URL endpoints to be excluded from the scraping.
+// Returns: an Options function.
+func WithBlacklist(blacklist []string) Options {
+	return func(o *Scraper) {
+		o.Blacklist = append(o.Blacklist, blacklist...)
+	}
+}
diff --git a/tools/scraper/scraper.go b/tools/scraper/scraper.go
new file mode 100644
index 000000000..03bd4edca
--- /dev/null
+++ b/tools/scraper/scraper.go
@@ -0,0 +1,214 @@
+package scraper
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net/url"
+	"strings"
+	"time"
+
+	"github.com/gocolly/colly"
+	"github.com/tmc/langchaingo/tools"
+)
+
+const (
+	DefaultMaxDepth  = 1
+	DefaultParallels = 2
+	DefaultDelay     = 3
+	DefaultAsync     = true
+)
+
+var ErrScrapingFailed = errors.New("scraper could not read URL, or scraping is not allowed for provided URL")
+
+type Scraper struct {
+	MaxDepth  int
+	Parallels int
+	Delay     int64
+	Blacklist []string
+	Async     bool
+}
+
+var _ tools.Tool = Scraper{}
+
+// New creates a new instance of Scraper with the provided options.
+//
+// The options parameter is a variadic argument allowing the user to specify
+// custom configuration options for the Scraper. These options are
+// functions that modify the Scraper's properties.
+//
+// The function returns a pointer to a Scraper instance and an error. The
+// error value is nil if the Scraper is created successfully.
+func New(options ...Options) (*Scraper, error) {
+	scraper := &Scraper{
+		MaxDepth:  DefaultMaxDepth,
+		Parallels: DefaultParallels,
+		Delay:     DefaultDelay,
+		Async:     DefaultAsync,
+		Blacklist: []string{
+			"login",
+			"signup",
+			"signin",
+			"register",
+			"logout",
+			"download",
+			"redirect",
+		},
+	}
+
+	for _, opt := range options {
+		opt(scraper)
+	}
+
+	return scraper, nil
+}
+
+// Name returns the name of the Scraper tool.
+func (s Scraper) Name() string {
+	return "Web Scraper"
+}
+
+// Description returns a short description of what the Scraper tool does.
+func (s Scraper) Description() string {
+	return `
+	Web Scraper will scan a URL and return the content of the web page.
+	Input should be a working URL.
+	`
+}
+
+// Call scrapes a website and returns the site data.
+//
+// The function takes a context.Context object for managing the execution
+// context and a string input representing the URL of the website to be scraped.
+// It returns a string containing the scraped data and an error if any.
+//
+//nolint:all
+func (s Scraper) Call(ctx context.Context, input string) (string, error) {
+	_, err := url.ParseRequestURI(input)
+	if err != nil {
+		return "", fmt.Errorf("%w: %s", ErrScrapingFailed, err)
+	}
+
+	c := colly.NewCollector(
+		colly.MaxDepth(s.MaxDepth),
+		colly.Async(s.Async),
+	)
+
+	err = c.Limit(&colly.LimitRule{
+		DomainGlob:  "*",
+		Parallelism: s.Parallels,
+		Delay:       time.Duration(s.Delay) * time.Second,
+	})
+
+	if err != nil {
+		return "", fmt.Errorf("%w: %s", ErrScrapingFailed, err)
+	}
+
+	var siteData strings.Builder
+	homePageLinks := make(map[string]bool)
+	scrapedLinks := make(map[string]bool)
+
+	c.OnRequest(func(r *colly.Request) {
+		if ctx.Err() != nil {
+			r.Abort()
+		}
+	})
+
+	c.OnHTML("html", func(e *colly.HTMLElement) {
+		currentURL := e.Request.URL.String()
+
+		// Only process the page if it hasn't been visited yet
+		if !scrapedLinks[currentURL] {
+			scrapedLinks[currentURL] = true
+
+			siteData.WriteString("\n\nPage URL: " + currentURL)
+
+			title := e.ChildText("title")
+			if title != "" {
+				siteData.WriteString("\nPage Title: " + title)
+			}
+
+			description := e.ChildAttr("meta[name=description]", "content")
+			if description != "" {
+				siteData.WriteString("\nPage Description: " + description)
+			}
+
+			siteData.WriteString("\nHeaders:")
+			e.ForEach("h1, h2, h3, h4, h5, h6", func(_ int, el *colly.HTMLElement) {
+				siteData.WriteString("\n" + el.Text)
+			})
+
+			siteData.WriteString("\nContent:")
+			e.ForEach("p", func(_ int, el *colly.HTMLElement) {
+				siteData.WriteString("\n" + el.Text)
+			})
+
+			if currentURL == input {
+				e.ForEach("a", func(_ int, el *colly.HTMLElement) {
+					link := el.Attr("href")
+					if link != "" && !homePageLinks[link] {
+						homePageLinks[link] = true
+						siteData.WriteString("\nLink: " + link)
+					}
+				})
+			}
+		}
+	})
+
+	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
+		link := e.Attr("href")
+		absoluteLink := e.Request.AbsoluteURL(link)
+
+		// Parse the link to get the hostname
+		u, err := url.Parse(absoluteLink)
+		if err != nil {
+			// Skip links that cannot be parsed
+			return
+		}
+
+		// Check if the link's hostname matches the current request's hostname
+		if u.Hostname() != e.Request.URL.Hostname() {
+			return
+		}
+
+		// Check for redundant pages
+		for _, item := range s.Blacklist {
+			if strings.Contains(u.Path, item) {
+				return
+			}
+		}
+
+		// Normalize the path to treat '/' and '/index.html' as the same path
+		if u.Path == "/index.html" || u.Path == "" {
+			u.Path = "/"
+		}
+
+		// Only visit the page if it hasn't been visited yet
+		if !scrapedLinks[u.String()] {
+			err := c.Visit(u.String())
+			if err != nil {
+				siteData.WriteString(fmt.Sprintf("\nError following link %s: %v", link, err))
+			}
+		}
+	})
+
+	err = c.Visit(input)
+	if err != nil {
+		return "", fmt.Errorf("%w: %s", ErrScrapingFailed, err)
+	}
+
+	select {
+	case <-ctx.Done():
+		return "", ctx.Err()
+	default:
+		c.Wait()
+	}
+
+	// Append all scraped links
+	siteData.WriteString("\n\nScraped Links:")
+	for link := range scrapedLinks {
+		siteData.WriteString("\n" + link)
+	}
+
+	return siteData.String(), nil
+}
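
For reviewers, a minimal usage sketch of the new tool (not part of the patch). It assumes the repository's module path github.com/tmc/langchaingo and uses https://example.com as a placeholder URL:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"log"

	"github.com/tmc/langchaingo/tools/scraper"
)

func main() {
	// Build a scraper, overriding a couple of the defaults.
	s, err := scraper.New(
		scraper.WithMaxDepth(2),                   // follow links two levels deep
		scraper.WithDelay(1),                      // wait 1 second between requests
		scraper.WithBlacklist([]string{"search"}), // also skip paths containing "search"
	)
	if err != nil {
		log.Fatal(err)
	}

	// Scraper satisfies tools.Tool, so Call can also be driven by an agent.
	content, err := s.Call(context.Background(), "https://example.com")
	if errors.Is(err, scraper.ErrScrapingFailed) {
		log.Fatalf("scrape failed: %v", err)
	} else if err != nil {
		log.Fatal(err)
	}
	fmt.Println(content)
}
```

The errors.Is check above relies on Call wrapping ErrScrapingFailed with %w, as in the patch.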