From 1d6ebcef1333d8d8c65a6fbbf64bc9c238400bee Mon Sep 17 00:00:00 2001 From: Richard Hallett Date: Wed, 5 Apr 2023 13:01:38 +0200 Subject: [PATCH] Add support for denying known bots --- data/COUNTER_Robots_list.json | 1357 +++++++++++++++++++++++++++++++++ docker/web/Dockerfile | 3 + internal/app/auth/auth.go | 4 + internal/app/net/http.go | 38 + 4 files changed, 1402 insertions(+) create mode 100644 data/COUNTER_Robots_list.json diff --git a/data/COUNTER_Robots_list.json b/data/COUNTER_Robots_list.json new file mode 100644 index 0000000..af6234f --- /dev/null +++ b/data/COUNTER_Robots_list.json @@ -0,0 +1,1357 @@ +[ + { + "pattern": "bot", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Buck\\/[0-9]", + "last_changed": "2019-11-19" + }, + { + "pattern": "spider", + "last_changed": "2017-08-08" + }, + { + "pattern": "crawl", + "last_changed": "2017-08-08" + }, + { + "pattern": "^.?$", + "last_changed": "2017-08-08" + }, + { + "pattern": "[^a]fish", + "last_changed": "2017-08-08" + }, + { + "pattern": "^IDA$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^ruby$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^@ozilla\\/\\d", + "last_changed": "2017-08-08" + }, + { + "pattern": "^脝脝陆芒潞贸碌脛$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^破解后的$", + "last_changed": "2017-08-08" + }, + { + "pattern": "AddThis", + "last_changed": "2017-08-08" + }, + { + "pattern": "A6-Indexer", + "last_changed": "2019-11-19", + "description": "A6 Corp's bot" + }, + { + "pattern": "ADmantX", + "last_changed": "2017-08-08" + }, + { + "pattern": "alexa", + "last_changed": "2017-08-08" + }, + { + "pattern": "Alexandria(\\s|\\+)prototype(\\s|\\+)project", + "last_changed": "2017-08-08" + }, + { + "pattern": "AllenTrack", + "last_changed": "2017-08-08" + }, + { + "pattern": "almaden", + "last_changed": "2017-08-08" + }, + { + "pattern": "appie", + "last_changed": "2017-08-08" + }, + { + "pattern": "API[\\+\\s]scraper", + "last_changed": "2018-02-15", + 
"description": "API scrapers are robots." + }, + { + "pattern": "Arachni", + "last_changed": "2018-12-13", + "description": "http://www.arachni-scanner.com/" + }, + { + "pattern": "Arachmo", + "last_changed": "2017-08-08" + }, + { + "pattern": "architext", + "last_changed": "2017-08-08" + }, + { + "pattern": "ArchiveTeam", + "last_changed": "2018-02-15", + "description": "ArchiveTeam is a robot that is archiving the web.", + "url": "https://www.archiveteam.org/" + }, + { + "pattern": "aria2\\/\\d", + "last_changed": "2017-08-08" + }, + { + "pattern": "arks", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Array$", + "last_changed": "2017-08-08" + }, + { + "pattern": "asterias", + "last_changed": "2017-08-08" + }, + { + "pattern": "atomz", + "last_changed": "2017-08-08" + }, + { + "pattern": "axios\\/\\d", + "last_changed": "2022-03-04", + "description": "Promise based HTTP client for the browser and Node.js", + "url": "https://github.com/axios/axios" + }, + { + "pattern": "BDFetch", + "last_changed": "2017-08-08" + }, + { + "pattern": "Betsie", + "last_changed": "2017-08-08" + }, + { + "pattern": "baidu", + "last_changed": "2017-08-08" + }, + { + "pattern": "biglotron", + "last_changed": "2017-08-08" + }, + { + "pattern": "BingPreview", + "last_changed": "2017-08-08" + }, + { + "pattern": "binlar", + "last_changed": "2017-08-08" + }, + { + "pattern": "bjaaland", + "last_changed": "2017-08-08" + }, + { + "pattern": "Blackboard[\\+\\s]Safeassign", + "last_changed": "2017-08-08" + }, + { + "pattern": "blaiz-bee", + "last_changed": "2019-11-19" + }, + { + "pattern": "bloglines", + "last_changed": "2017-08-08" + }, + { + "pattern": "blogpulse", + "last_changed": "2017-08-08" + }, + { + "pattern": "boitho\\.com-dc", + "last_changed": "2019-11-19" + }, + { + "pattern": "bookmark-manager", + "last_changed": "2019-11-19" + }, + { + "pattern": "Brutus\\/AET", + "last_changed": "2017-08-08" + }, + { + "pattern": "BUbiNG", + "last_changed": "2018-02-07", + 
"description": "BUbiNG bot scrapes the internet for contacts and news articles that would be of interest to their students and university.", + "url": "http://law.di.unimi.it/BUbiNG.html" + }, + { + "pattern": "bwh3_user_agent", + "last_changed": "2017-08-08" + }, + { + "pattern": "CakePHP", + "last_changed": "2017-08-08" + }, + { + "pattern": "celestial", + "last_changed": "2017-08-08" + }, + { + "pattern": "centuryb", + "last_changed": "2022-04-27" + }, + { + "pattern": "cfnetwork", + "last_changed": "2017-08-08" + }, + { + "pattern": "checklink", + "last_changed": "2017-08-08" + }, + { + "pattern": "checkprivacy", + "last_changed": "2017-08-08" + }, + { + "pattern": "China\\sLocal\\sBrowse\\s2\\.6", + "last_changed": "2017-08-08" + }, + { + "pattern": "Citoid", + "last_changed": "2020-02-25", + "url": "https://www.mediawiki.org/wiki/Citoid" + }, + { + "pattern": "cloakDetect", + "last_changed": "2017-08-08" + }, + { + "pattern": "coccoc\\/1\\.0", + "last_changed": "2017-08-08" + }, + { + "pattern": "Code\\sSample\\sWeb\\sClient", + "last_changed": "2017-08-08" + }, + { + "pattern": "ColdFusion", + "last_changed": "2017-08-08" + }, + { + "pattern": "collection@infegy.com", + "last_changed": "2017-08-08" + }, + { + "pattern": "com\\.plumanalytics", + "last_changed": "2017-08-08" + }, + { + "pattern": "combine", + "last_changed": "2017-08-08" + }, + { + "pattern": "contentmatch", + "last_changed": "2017-08-08" + }, + { + "pattern": "ContentSmartz", + "last_changed": "2017-08-08" + }, + { + "pattern": "convera", + "last_changed": "2017-08-08" + }, + { + "pattern": "core", + "last_changed": "2017-08-08" + }, + { + "pattern": "Cortana", + "last_changed": "2019-10-30", + "description": "Cortana is a virtual assistant created by Microsoft", + "url": "https://www.microsoft.com/en-us/cortana" + }, + { + "pattern": "CoverScout", + "last_changed": "2017-08-08" + }, + { + "pattern": "crusty\\/\\d", + "last_changed": "2021-07-05", + "description": "Crusty web crawler", + 
"url": "https://github.com/let4be/crusty" + }, + { + "pattern": "curl\\/", + "last_changed": "2017-08-08" + }, + { + "pattern": "cursor", + "last_changed": "2017-08-08" + }, + { + "pattern": "custo", + "last_changed": "2017-08-08" + }, + { + "pattern": "DataCha0s\\/2\\.0", + "last_changed": "2017-08-08" + }, + { + "pattern": "daum(oa)?", + "last_changed": "2020-09-10", + "url": "https://cs.daum.net/faq/15/4118.html?faqId=28966" + }, + { + "pattern": "^\\%?default\\%?$", + "last_changed": "2017-08-08" + }, + { + "pattern": "DeuSu\\/", + "last_changed": "2017-08-08" + }, + { + "pattern": "Dispatch\\/\\d", + "last_changed": "2017-08-08" + }, + { + "pattern": "Docoloc", + "last_changed": "2018-02-15", + "description": "Docoloc bot is a web scraping bot used by the Docoloc plagiarism detection service. Docoloc bot searches papers for text fragments that also occur in other documents by using web scraping. Docoloc bot respects robots.txt.", + "url": "https://www.distilnetworks.com/bot-directory/bot/docoloc/" + }, + { + "pattern": "docomo", + "last_changed": "2017-08-08" + }, + { + "pattern": "Download\\+Master", + "last_changed": "2017-08-08" + }, + { + "pattern": "Drupal", + "last_changed": "2020-07-20", + "description": "Drupal is a content management system. Some Drupal instances perform harvesting or crawling of web resources and should be ignored." + }, + { + "pattern": "DSurf", + "last_changed": "2017-08-08" + }, + { + "pattern": "DTS Agent", + "last_changed": "2018-02-15", + "description": "DTS Agent is an e-mail harvesting robot." + }, + { + "pattern": "EasyBib[\\+\\s]AutoCite[\\+\\s]", + "last_changed": "2018-02-15", + "description": "Easybib Autocite bot is a web scraping bot that allows users to type in a URL and Easybib bot will automatically scrape the necessary information to properly cite the web page, creating a bibliography. 
Easybib Autocite bot is not known to cause harm and helps give credit to original content owners.", + "url": "https://www.distilnetworks.com/bot-directory/bot/easybib-autocite/" + }, + { + "pattern": "easydl", + "last_changed": "2017-08-08" + }, + { + "pattern": "EBSCO\\sEJS\\sContent\\sServer", + "last_changed": "2017-08-08" + }, + { + "pattern": "EcoSearch", + "last_changed": "2020-07-02" + }, + { + "pattern": "ELinks\\/", + "last_changed": "2017-08-08" + }, + { + "pattern": "EmailSiphon", + "last_changed": "2017-08-08" + }, + { + "pattern": "EmailWolf", + "last_changed": "2017-08-08" + }, + { + "pattern": "Embedly", + "last_changed": "2017-08-08" + }, + { + "pattern": "EThOS\\+\\(British\\+Library\\)", + "last_changed": "2017-08-08" + }, + { + "pattern": "facebookexternalhit\\/", + "last_changed": "2017-08-08" + }, + { + "pattern": "favorg", + "last_changed": "2017-08-08" + }, + { + "pattern": "Faveeo\\/\\d", + "last_changed": "2022-03-04", + "url": "https://www.faveeo.com" + }, + { + "pattern": "FDM(\\s|\\+)\\d", + "last_changed": "2017-08-08" + }, + { + "pattern": "Feedbin", + "last_changed": "2019-01-22" + }, + { + "pattern": "feedburner", + "last_changed": "2017-08-08" + }, + { + "pattern": "FeedFetcher", + "last_changed": "2017-08-08" + }, + { + "pattern": "feedreader", + "last_changed": "2017-08-08" + }, + { + "pattern": "ferret", + "last_changed": "2017-08-08" + }, + { + "pattern": "Fetch(\\s|\\+)API(\\s|\\+)Request", + "last_changed": "2017-08-08" + }, + { + "pattern": "findlinks", + "last_changed": "2017-08-08" + }, + { + "pattern": "findthatfile", + "last_changed": "2017-08-08" + }, + { + "pattern": "^FileDown$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Filter$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^firefox$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^FOCA", + "last_changed": "2017-08-08" + }, + { + "pattern": "Fulltext", + "last_changed": "2017-08-08" + }, + { + "pattern": "Funnelback", + "last_changed": 
"2017-08-08" + }, + { + "pattern": "Genieo", + "last_changed": "2018-09-05", + "Listed as a bad bot by Distil Networks": "2018-09-05", + "url": "https://www.distilnetworks.com/bot-directory/bot/genieo/" + }, + { + "pattern": "GetRight", + "last_changed": "2017-08-08" + }, + { + "pattern": "geturl", + "last_changed": "2017-08-08" + }, + { + "pattern": "GigablastOpenSource", + "last_changed": "2020-07-02" + }, + { + "pattern": "G-i-g-a-b-o-t", + "last_changed": "2018-02-15", + "description": "Self-explanatory!" + }, + { + "pattern": "GLMSLinkAnalysis", + "last_changed": "2017-08-08" + }, + { + "pattern": "Goldfire(\\s|\\+)Server", + "last_changed": "2017-08-08" + }, + { + "pattern": "google", + "last_changed": "2017-08-08" + }, + { + "pattern": "Grammarly", + "last_changed": "2018-02-15", + "description": "Grammarly/1.0 bot is a web scraping bot used by the writing tool Grammarly. Grammarly allows users to upload written pieces of work and then scrapes the content looking for grammar mistakes. 
Grammarly makes sure that everything users type is easy to read, effective, and mistake-free.", + "url": "https://www.distilnetworks.com/bot-directory/bot/grammarly1-0/" + }, + { + "pattern": "GroupHigh\\/\\d", + "last_changed": "2022-03-04", + "description": "GroupHigh is an essential content marketing tool", + "url": "https://www.grouphigh.com" + }, + { + "pattern": "grub", + "last_changed": "2017-08-08" + }, + { + "pattern": "gulliver", + "last_changed": "2017-08-08" + }, + { + "pattern": "gvfs\\/", + "last_changed": "2017-08-08" + }, + { + "pattern": "harvest", + "last_changed": "2017-08-08" + }, + { + "pattern": "heritrix", + "last_changed": "2017-08-08" + }, + { + "pattern": "holmes", + "last_changed": "2017-08-08" + }, + { + "pattern": "htdig", + "last_changed": "2017-08-08" + }, + { + "pattern": "htmlparser", + "last_changed": "2017-08-08" + }, + { + "pattern": "HeadlessChrome", + "last_changed": "2021-12-07", + "description": "Headless Chrome is used to automate Chrome requests for testing purposes.", + "url": "https://developers.google.com/web/updates/2017/04/headless-chrome" + }, + { + "pattern": "HttpComponents\\/1.1", + "last_changed": "2017-08-08" + }, + { + "pattern": "HTTPFetcher", + "last_changed": "2017-08-08" + }, + { + "pattern": "http.?client", + "last_changed": "2017-08-08" + }, + { + "pattern": "httpget", + "last_changed": "2017-08-08" + }, + { + "pattern": "httpx", + "last_changed": "2021-11-23", + "url": "https://github.com/projectdiscovery/httpx" + }, + { + "pattern": "httrack", + "last_changed": "2017-08-08" + }, + { + "pattern": "ia_archiver", + "last_changed": "2017-08-08" + }, + { + "pattern": "ichiro", + "last_changed": "2017-08-08" + }, + { + "pattern": "iktomi", + "last_changed": "2017-08-08" + }, + { + "pattern": "ilse", + "last_changed": "2017-08-08" + }, + { + "pattern": "Indy Library", + "last_changed": "2017-08-08" + }, + { + "pattern": "insomnia", + "last_changed": "2022-05-04", + "description": "The open-source, 
cross-platform API client for GraphQL, REST, and gRPC.", + "url": "https://github.com/Kong/insomnia" + }, + { + "pattern": "^integrity\\/\\d", + "last_changed": "2017-08-08" + }, + { + "pattern": "internetseer", + "last_changed": "2017-08-08" + }, + { + "pattern": "intute", + "last_changed": "2017-08-08" + }, + { + "pattern": "iSiloX", + "last_changed": "2017-08-08" + }, + { + "pattern": "iskanie", + "last_changed": "2017-08-08" + }, + { + "pattern": "^java\\/\\d{1,2}.\\d", + "last_changed": "2017-08-08" + }, + { + "pattern": "jeeves", + "last_changed": "2017-08-08" + }, + { + "pattern": "Jersey\\/\\d", + "last_changed": "2020-02-07" + }, + { + "pattern": "jobo", + "last_changed": "2017-08-08" + }, + { + "pattern": "Koha", + "last_changed": "2021-12-29", + "description": "Scan all URLs found in of Koha records and displays if resources are available or not.", + "url": "https://wiki.koha-community.org/wiki/Check-url_enhancements" + }, + { + "pattern": "kyluka", + "last_changed": "2017-08-08" + }, + { + "pattern": "larbin", + "last_changed": "2017-08-08" + }, + { + "pattern": "libcurl", + "last_changed": "2017-08-08" + }, + { + "pattern": "libhttp", + "last_changed": "2017-08-08" + }, + { + "pattern": "libwww", + "last_changed": "2017-08-08" + }, + { + "pattern": "lilina", + "last_changed": "2017-08-08" + }, + { + "pattern": "^LinkAnalyser", + "last_changed": "2019-11-19" + }, + { + "pattern": "link.?check", + "last_changed": "2017-08-08" + }, + { + "pattern": "LinkLint-checkonly", + "last_changed": "2017-08-08" + }, + { + "pattern": "^LinkParser\\/", + "last_changed": "2017-08-08" + }, + { + "pattern": "^LinkSaver\\/", + "last_changed": "2017-08-08" + }, + { + "pattern": "linkscan", + "last_changed": "2017-08-08" + }, + { + "pattern": "LinkTiger", + "last_changed": "2018-02-15", + "description": "LinkTiger is a website link checking product.", + "url": "https://linktiger.com/product/" + }, + { + "pattern": "linkwalker", + "last_changed": "2017-08-08" + }, + { + 
"pattern": "lipperhey", + "last_changed": "2017-08-08" + }, + { + "pattern": "livejournal\\.com", + "last_changed": "2017-08-08" + }, + { + "pattern": "LOCKSS", + "last_changed": "2017-08-08" + }, + { + "pattern": "LongURL.API", + "last_changed": "2017-08-08" + }, + { + "pattern": "ltx71", + "last_changed": "2017-08-08" + }, + { + "pattern": "lwp", + "last_changed": "2017-08-08" + }, + { + "pattern": "lycos[_+]", + "last_changed": "2018-12-13" + }, + { + "pattern": "MaCoCu", + "last_changed": "2021-11-23", + "url": "https://www.clarin.si/info/macocu-massive-collection-and-curation-of-monolingual-and-bilingual-data/" + }, + { + "pattern": "mail\\.ru", + "url": "https://help.mail.ru/webmaster/indexing/robots.txt/rules/user-agent", + "last_changed": "2017-08-08" + }, + { + "pattern": "MarcEdit", + "last_changed": "2019-12-12" + }, + { + "pattern": "mediapartners-google", + "last_changed": "2019-11-19" + }, + { + "pattern": "megite", + "last_changed": "2017-08-08" + }, + { + "pattern": "MetaInspector", + "last_changed": "2022-05-04", + "description": "Ruby gem for web scraping purposes.", + "url": "https://github.com/metainspector/metainspector" + }, + { + "pattern": "MetaURI[\\+\\s]API\\/\\d\\.\\d", + "last_changed": "2017-08-08" + }, + { + "pattern": "Microsoft(\\s|\\+)URL(\\s|\\+)Control", + "last_changed": "2017-08-08" + }, + { + "pattern": "Microsoft Office Existence Discovery", + "last_changed": "2017-08-08" + }, + { + "pattern": "Microsoft Office Protocol Discovery", + "last_changed": "2017-08-08" + }, + { + "pattern": "Microsoft-WebDAV-MiniRedir", + "last_changed": "2017-08-08" + }, + { + "pattern": "mimas", + "last_changed": "2017-08-08" + }, + { + "pattern": "mnogosearch", + "last_changed": "2017-08-08" + }, + { + "pattern": "moget", + "last_changed": "2017-08-08" + }, + { + "pattern": "motor", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Mozilla$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Mozilla.4\\.0$", + "last_changed": 
"2017-08-08" + }, + { + "pattern": "^Mozilla\\/4\\.0\\+\\(compatible;\\)$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Mozilla\\/4\\.0\\+\\(compatible;\\+ICS\\)$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Mozilla\\/4\\.5\\+\\[en]\\+\\(Win98;\\+I\\)$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Mozilla.5\\.0$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Mozilla\\/5.0\\+\\(compatible;\\+MSIE\\+6\\.0;\\+Windows\\+NT\\+5\\.0\\)$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Mozilla\\/5\\.0\\+like\\+Gecko$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Mozilla\\/5.0(\\s|\\+)Gecko\\/20100115(\\s|\\+)Firefox\\/3.6$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^MSIE", + "last_changed": "2017-08-08" + }, + { + "pattern": "MuscatFerre", + "last_changed": "2017-08-08" + }, + { + "pattern": "myweb", + "last_changed": "2017-08-08" + }, + { + "pattern": "nagios", + "last_changed": "2017-08-08" + }, + { + "pattern": "^NetAnts\\/\\d", + "last_changed": "2017-08-08" + }, + { + "pattern": "netcraft", + "last_changed": "2017-08-08" + }, + { + "pattern": "netluchs", + "last_changed": "2017-08-08" + }, + { + "pattern": "nettle", + "last_changed": "2022-04-27", + "description": "AI company, doing natural language processing.", + "url": "https://www.nettle.sk" + }, + { + "pattern": "newspaper\\/\\d", + "last_changed": "2021-07-05", + "description": "Python 3 library for news, full-text, and article metadata extraction.", + "url": "https://github.com/codelucas/newspaper" + }, + { + "pattern": "ng\\/2\\.", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Ning\\/\\d", + "last_changed": "2019-08-06" + }, + { + "pattern": "no_user_agent", + "last_changed": "2017-08-08" + }, + { + "pattern": "nomad", + "last_changed": "2017-08-08" + }, + { + "pattern": "nutch", + "last_changed": "2017-08-08" + }, + { + "pattern": "^oaDOI$", + "last_changed": "2018-02-15", + "description": "oaDOI is a service that indexes 
open access articles.", + "url": "https://oadoi.org/" + }, + { + "pattern": "ocelli", + "last_changed": "2017-08-08" + }, + { + "pattern": "Offline(\\s|\\+)Navigator", + "last_changed": "2017-08-08" + }, + { + "pattern": "OgScrper", + "last_changed": "2020-07-02" + }, + { + "pattern": "okhttp", + "last_changed": "2020-02-25", + "description": "okhttp is a Java HTTP client library.", + "url": "http://square.github.io/okhttp/" + }, + { + "pattern": "onetszukaj", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Opera\\/4$", + "last_changed": "2017-08-08" + }, + { + "pattern": "OurBrowser", + "last_changed": "2017-08-08" + }, + { + "pattern": "panscient", + "last_changed": "2017-08-08" + }, + { + "pattern": "parsijoo", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Pattern\\/\\d", + "last_changed": "2020-02-25", + "url": "https://www.clips.uantwerpen.be/pattern" + }, + { + "pattern": "Pcore-HTTP", + "last_changed": "2019-11-19" + }, + { + "pattern": "pear\\.php\\.net", + "last_changed": "2017-08-08" + }, + { + "pattern": "perman", + "last_changed": "2017-08-08" + }, + { + "pattern": "PHP\\/", + "last_changed": "2017-08-08" + }, + { + "pattern": "pidcheck", + "last_changed": "2019-10-18", + "description": "PidCheck is a generic crawler for extracting data about PiD's from landing pages and doing some calculation on the health of the link", + "url": "https://github.com/datacite/pidcheck" + }, + { + "pattern": "pioneer", + "last_changed": "2017-08-08" + }, + { + "pattern": "playmusic\\.com", + "last_changed": "2017-08-08" + }, + { + "pattern": "playstarmusic\\.com", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Postgenomic(\\s|\\+)v2", + "last_changed": "2017-08-08" + }, + { + "pattern": "powermarks", + "last_changed": "2017-08-08" + }, + { + "pattern": "proximic", + "last_changed": "2017-08-08" + }, + { + "pattern": "PycURL", + "last_changed": "2017-08-08" + }, + { + "pattern": "python", + "last_changed": "2017-08-08" + }, + { + "pattern": 
"Qwantify", + "last_changed": "2017-08-08" + }, + { + "pattern": "rambler", + "last_changed": "2017-08-08" + }, + { + "pattern": "ReactorNetty\\/\\d", + "last_changed": "2020-02-07" + }, + { + "pattern": "Readpaper", + "last_changed": "2017-08-08" + }, + { + "pattern": "redalert", + "last_changed": "2017-08-08" + }, + { + "pattern": "RestSharp", + "last_changed": "2022-05-04", + "description": "Simple REST and HTTP API Client for .NET", + "url": "https://github.com/restsharp/RestSharp" + }, + { + "pattern": "Riddler", + "last_changed": "2018-02-15", + "description": "Riddler is an online research project which investigates algorithms for mapping the topology of the Internet. Riddler bot scrapes data about public systems to use in its projects.", + "url": "https://www.distilnetworks.com/bot-directory/bot/riddler/" + }, + { + "pattern": "robozilla", + "last_changed": "2017-08-08" + }, + { + "pattern": "rss", + "last_changed": "2017-08-08" + }, + { + "pattern": "scan4mail", + "last_changed": "2017-08-08" + }, + { + "pattern": "scientificcommons", + "last_changed": "2017-08-08" + }, + { + "pattern": "scirus", + "last_changed": "2017-08-08" + }, + { + "pattern": "scooter", + "last_changed": "2017-08-08" + }, + { + "pattern": "Scrapy\\/\\d", + "last_changed": "2017-08-08" + }, + { + "pattern": "ScoutJet", + "last_changed": "2019-02-12", + "url": "http://www.scoutjet.com/" + }, + { + "pattern": "^scrutiny\\/\\d", + "last_changed": "2017-08-08" + }, + { + "pattern": "SearchBloxIntra", + "last_changed": "2017-08-08" + }, + { + "pattern": "shoutcast", + "last_changed": "2017-08-08" + }, + { + "pattern": "Site24x7", + "last_changed": "2020-02-14" + }, + { + "pattern": "SkypeUriPreview", + "last_changed": "2017-08-08" + }, + { + "pattern": "slurp", + "last_changed": "2017-08-08" + }, + { + "pattern": "sogou", + "last_changed": "2017-08-08" + }, + { + "pattern": "speedy", + "last_changed": "2017-08-08" + }, + { + "pattern": "sqlmap", + "last_changed": "2020-02-25", + "url": 
"https://github.com/sqlmapproject/sqlmap" + }, + { + "pattern": "SrceDAMP", + "last_changed": "2020-10-20", + "url": "https://haw.nsk.hr/en/frequently-asked-questions/" + }, + { + "pattern": "Strider", + "last_changed": "2017-08-08" + }, + { + "pattern": "summify", + "last_changed": "2017-08-08" + }, + { + "pattern": "sunrise", + "last_changed": "2017-08-08" + }, + { + "pattern": "Sysomos", + "last_changed": "2017-08-08" + }, + { + "pattern": "T\\-H\\-U\\-N\\-D\\-E\\-R\\-S\\-T\\-O\\-N\\-E", + "last_changed": "2017-08-08" + }, + { + "pattern": "tailrank", + "last_changed": "2017-08-08" + }, + { + "pattern": "Teleport(\\s|\\+)Pro", + "last_changed": "2017-08-08" + }, + { + "pattern": "Teoma", + "last_changed": "2017-08-08" + }, + { + "pattern": "The[\\+\\s]Knowledge[\\+\\s]AI", + "last_changed": "2021-11-23", + "url": "https://www.webmasterworld.com/search_engine_spiders/4896765.htm" + }, + { + "pattern": "titan", + "last_changed": "2017-08-08" + }, + { + "pattern": "^Traackr\\.com$", + "last_changed": "2017-08-08" + }, + { + "pattern": "Trello", + "last_changed": "2021-03-30" + }, + { + "pattern": "Trove", + "last_changed": "2017-08-08" + }, + { + "pattern": "Turnitin", + "last_changed": "2020-07-01", + "description": "This robot collects content from the Internet for the sole purpose of helping educational institutions prevent plagiarism.", + "url": "https://turnitin.com/robot/crawlerinfo.html" + }, + { + "pattern": "twiceler", + "last_changed": "2017-08-08" + }, + { + "pattern": "Typhoeus", + "last_changed": "2020-02-25", + "url": "https://github.com/typhoeus/typhoeus" + }, + { + "pattern": "ucsd", + "last_changed": "2017-08-08" + }, + { + "pattern": "ultraseek", + "last_changed": "2017-08-08" + }, + { + "pattern": "^undefined$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^unknown$", + "last_changed": "2017-08-08" + }, + { + "pattern": "Unpaywall", + "last_changed": "2019-07-31" + }, + { + "pattern": "URL2File", + "last_changed": "2017-08-08" + }, + { + 
"pattern": "urlaliasbuilder", + "last_changed": "2017-08-08" + }, + { + "pattern": "urllib", + "last_changed": "2017-08-08" + }, + { + "pattern": "^user.?agent$", + "last_changed": "2017-08-08" + }, + { + "pattern": "^User-Agent", + "last_changed": "2019-11-19" + }, + { + "pattern": "validator", + "last_changed": "2017-08-08" + }, + { + "pattern": "virus.detector", + "last_changed": "2017-08-08" + }, + { + "pattern": "voila", + "last_changed": "2017-08-08" + }, + { + "pattern": "^voltron$", + "last_changed": "2017-08-08" + }, + { + "pattern": "voyager\\/", + "last_changed": "2017-08-08" + }, + { + "pattern": "w3af\\.org", + "description": "Web Application Attack and Audit Framework", + "url": "https://w3af.org/", + "last_changed": "2017-08-08" + }, + { + "pattern": "Wanadoo", + "last_changed": "2017-08-08" + }, + { + "pattern": "Web(\\s|\\+)Downloader", + "last_changed": "2017-08-08" + }, + { + "pattern": "WebCloner", + "last_changed": "2017-08-08" + }, + { + "pattern": "webcollage", + "last_changed": "2017-08-08" + }, + { + "pattern": "WebCopier", + "last_changed": "2017-08-08" + }, + { + "pattern": "Webinator", + "last_changed": "2017-08-08" + }, + { + "pattern": "weblayers", + "last_changed": "2017-08-08" + }, + { + "pattern": "Webmetrics", + "last_changed": "2017-08-08" + }, + { + "pattern": "webmirror", + "last_changed": "2017-08-08" + }, + { + "pattern": "webmon", + "last_changed": "2017-08-08" + }, + { + "pattern": "weborama-fetcher", + "last_changed": "2019-11-19" + }, + { + "pattern": "webreaper", + "last_changed": "2017-08-08" + }, + { + "pattern": "WebStripper", + "last_changed": "2017-08-08" + }, + { + "pattern": "WebZIP", + "last_changed": "2017-08-08" + }, + { + "pattern": "Wget", + "last_changed": "2017-08-08" + }, + { + "pattern": "WhatsApp", + "last_changed": "2021-03-30" + }, + { + "pattern": "wordpress", + "last_changed": "2017-08-08" + }, + { + "pattern": "worm", + "last_changed": "2017-08-08" + }, + { + "pattern": "www\\.gnip\\.com", + 
"last_changed": "2017-08-08" + }, + { + "pattern": "WWW-Mechanize", + "last_changed": "2019-11-19" + }, + { + "pattern": "xenu", + "last_changed": "2017-08-08" + }, + { + "pattern": "y!j", + "last_changed": "2017-08-08" + }, + { + "pattern": "yacy", + "last_changed": "2017-08-08" + }, + { + "pattern": "yahoo", + "last_changed": "2017-08-08" + }, + { + "pattern": "yandex", + "last_changed": "2017-08-08" + }, + { + "pattern": "Yeti\\/\\d", + "last_changed": "2019-02-12", + "url": "http://naver.me/spd" + }, + { + "pattern": "Zabbix", + "last_changed": "2021-11-23", + "url": "https://github.com/zabbix/zabbix" + }, + { + "pattern": "ZoteroTranslationServer", + "last_changed": "2022-03-04", + "description": "A Node.js-based server to run Zotero translators", + "url": "https://github.com/zotero/translation-server" + }, + { + "pattern": "zeus", + "last_changed": "2017-08-08" + }, + { + "pattern": "zyborg", + "last_changed": "2017-08-08" + }, + { + "pattern": "7siters", + "last_changed": "2020-02-25", + "url": "https://7ooo.ru/siters" + } +] \ No newline at end of file diff --git a/docker/web/Dockerfile b/docker/web/Dockerfile index 192b80a..a36fdf2 100644 --- a/docker/web/Dockerfile +++ b/docker/web/Dockerfile @@ -20,6 +20,9 @@ USER app # Copy the go binary from the builder to the container COPY --chown=app:app --from=builder /app/webbin /home/app/webbin +# Copy COUNTER robots file to the container +COPY --chown=app:app --from=builder /app/data/COUNTER_Robots_list.json /home/app/data/COUNTER_Robots_list.json + # Set the workdir to app dir WORKDIR /home/app/ diff --git a/internal/app/auth/auth.go b/internal/app/auth/auth.go index 48bc4f3..a38f7d7 100644 --- a/internal/app/auth/auth.go +++ b/internal/app/auth/auth.go @@ -10,6 +10,10 @@ import ( ) func GetAuthToken(config *app.Config) *jwtauth.JWTAuth { + // If no JWTPublic key return + if config.DataCite.JWTPublicKey == "" { + return nil + } publicKeyBlock, _ := pem.Decode([]byte(config.DataCite.JWTPublicKey)) publicKey, err 
// botListPath is where the COUNTER robots list is read from. The path is
// relative to the working directory of the running process.
const botListPath = "data/COUNTER_Robots_list.json"

var (
	// botOnce guards the one-time load of botPatterns.
	botOnce sync.Once
	// botPatterns holds the compiled, case-insensitive robot patterns. It is
	// written once inside botOnce and only read afterwards.
	botPatterns []*regexp.Regexp
)

// compileBotPatterns decodes data, a JSON array of objects carrying a
// "pattern" field, and compiles every pattern as a case-insensitive regular
// expression.
//
// Entries that cannot be used are skipped instead of aborting the whole list:
// a missing or empty pattern (an empty expression would match every user
// agent) and a pattern that Go's regexp package rejects. Problems are logged.
func compileBotPatterns(data []byte) []*regexp.Regexp {
	var entries []struct {
		Pattern string `json:"pattern"`
	}
	if err := json.Unmarshal(data, &entries); err != nil {
		// A syntax error leaves entries empty; a type mismatch in a single
		// entry still decodes the remaining ones, so carry on with those.
		log.Printf("decoding bot list: %v", err)
	}

	patterns := make([]*regexp.Regexp, 0, len(entries))
	for _, entry := range entries {
		if entry.Pattern == "" {
			continue
		}
		re, err := regexp.Compile("(?i)" + entry.Pattern)
		if err != nil {
			log.Printf("skipping bot pattern %q: %v", entry.Pattern, err)
			continue
		}
		patterns = append(patterns, re)
	}
	return patterns
}

// loadBotPatterns reads the robots list at path and returns its compiled
// patterns. An unreadable file is logged and yields no patterns; it must not
// terminate the process, because the load is triggered while serving a request.
func loadBotPatterns(path string) []*regexp.Regexp {
	data, err := ioutil.ReadFile(path)
	if err != nil {
		log.Printf("reading bot list %q: %v", path, err)
		return nil
	}
	return compileBotPatterns(data)
}

// isBot reports whether userAgent matches any pattern of the COUNTER robots
// list.
//
// The list is read and compiled on the first call only and reused for every
// later call, so the per-request cost is just the pattern matching. If the
// list could not be loaded, no user agent is treated as a bot.
func isBot(userAgent string) bool {
	botOnce.Do(func() {
		botPatterns = loadBotPatterns(botListPath)
	})

	for _, re := range botPatterns {
		if re.MatchString(userAgent) {
			return true
		}
	}
	return false
}