diff --git a/apps/sdk-api/src/bots/bots.readme.md b/apps/sdk-api/src/bots/bots.readme.md new file mode 100644 index 00000000..2f504974 --- /dev/null +++ b/apps/sdk-api/src/bots/bots.readme.md @@ -0,0 +1,6 @@ +# Device Detector - The Universal Device Detection library for parsing User Agents + +> @link https://matomo.org +> @license http://www.gnu.org/licenses/lgpl.html LGPL v3 or later + +[bots.ts](./bots.ts) is based on Matomo's bots.yml file. You can see the original version [here](https://raw.githubusercontent.com/matomo-org/device-detector/master/regexes/bots.yml). diff --git a/apps/sdk-api/src/bots/bots.ts b/apps/sdk-api/src/bots/bots.ts new file mode 100644 index 00000000..421891fa --- /dev/null +++ b/apps/sdk-api/src/bots/bots.ts @@ -0,0 +1,4901 @@ +export default [ + { + regex: 'monitoring360bot', + name: '360 Monitoring', + category: 'Site Monitor', + url: 'https://www.360monitoring.io', + producer: { + name: 'Plesk International GmbH', + url: 'https://www.plesk.com', + }, + }, + { + regex: 'Cloudflare-Healthchecks', + name: 'Cloudflare Health Checks', + category: 'Service Agent', + url: 'https://developers.cloudflare.com/health-checks/', + producer: { + name: 'CloudFlare', + url: 'https://www.cloudflare.com/', + }, + }, + { + regex: '360Spider', + name: '360Spider', + category: 'Search bot', + url: 'https://www.so.com/help/help_3_2.html', + producer: { + name: 'Online Media Group, Inc.', + url: '', + }, + }, + { + regex: 'Aboundex', + name: 'Aboundexbot', + category: 'Search bot', + url: 'http://www.aboundex.com/crawler/', + producer: { + name: 'Aboundex.com', + url: 'http://www.aboundex.com', + }, + }, + { + regex: 'AcoonBot', + name: 'Acoon', + category: 'Search bot', + url: 'http://www.acoon.de/robot.asp', + producer: { + name: 'Acoon GmbH', + url: 'http://www.acoon.de', + }, + }, + { + regex: 'AddThis\\.com', + name: 'AddThis.com', + category: 'Social Media Agent', + url: '', + producer: { + name: 'Clearspring Technologies, Inc.', + url: 
'http://www.clearspring.com', + }, + }, + { + regex: 'AhrefsBot', + name: 'aHrefs Bot', + category: 'Crawler', + url: 'https://ahrefs.com/robot', + producer: { + name: 'Ahrefs Pte Ltd', + url: 'https://ahrefs.com/robot', + }, + }, + { + regex: 'AhrefsSiteAudit/[\\d.]+', + name: 'AhrefsSiteAudit', + category: 'Site Monitor', + url: 'https://ahrefs.com/robot/site-audit', + producer: { + name: 'Ahrefs Pte Ltd', + url: 'https://ahrefs.com/', + }, + }, + { + regex: 'ia_archiver|alexabot|verifybot', + name: 'Alexa Crawler', + category: 'Search bot', + url: 'https://support.alexa.com/hc/en-us/sections/200100794-Crawlers', + producer: { + name: 'Alexa Internet', + url: 'https://www.alexa.com', + }, + }, + { + regex: 'alexa site audit', + name: 'Alexa Site Audit', + category: 'Site Monitor', + url: 'https://support.alexa.com/hc/en-us/articles/200450194', + producer: { + name: 'Alexa Internet', + url: 'https://www.alexa.com', + }, + }, + { + regex: 'Amazonbot', + name: 'Amazon Bot', + category: 'Crawler', + url: 'https://developer.amazon.com/support/amazonbot', + producer: { + name: 'Amazon.com, Inc.', + url: 'https://www.amazon.com/', + }, + }, + { + regex: 'Amazon[ -]Route ?53[ -]Health[ -]Check[ -]Service', + name: 'Amazon Route53 Health Check', + category: 'Service Agent', + producer: { + name: 'Amazon Web Services', + url: 'https://aws.amazon.com/', + }, + }, + { + regex: 'AmorankSpider', + name: 'Amorank Spider', + category: 'Crawler', + url: 'http://amorank.com/webcrawler.html', + producer: { + name: 'Amorank', + url: 'http://www.amorank.com', + }, + }, + { + regex: 'ApacheBench', + name: 'ApacheBench', + category: 'Benchmark', + url: 'https://httpd.apache.org/docs/2.4/programs/ab.html', + producer: { + name: 'The Apache Software Foundation', + url: 'https://www.apache.org/foundation/', + }, + }, + { + regex: 'Applebot', + name: 'Applebot', + category: 'Crawler', + url: 'https://support.apple.com/en-us/HT204683', + producer: { + name: 'Apple Inc', + url: 
'https://www.apple.com', + }, + }, + { + regex: 'AppSignalBot', + name: 'AppSignalBot', + category: 'Site Monitor', + url: 'https://docs.appsignal.com/uptime-monitoring/', + producer: { + name: 'AppSignal', + url: 'https://appsignal.com/', + }, + }, + { + regex: 'Arachni', + name: 'Arachni', + category: 'Security Checker', + url: 'https://www.arachni-scanner.com/', + producer: { + name: 'Sarosys LLC', + url: 'https://www.sarosys.com/', + }, + }, + { + regex: 'AspiegelBot', + name: 'AspiegelBot', + category: 'Crawler', + url: 'https://aspiegel.com/', + producer: { + name: 'Huawei', + url: 'https://www.huawei.com/', + }, + }, + { + regex: 'Castro 2, Episode Duration Lookup', + name: 'Castro 2', + category: 'Service Agent', + url: 'http://supertop.co/castro/', + producer: { + name: 'Supertop', + url: 'http://supertop.co', + }, + }, + { + regex: 'Curious George', + name: 'Analytics SEO Crawler', + category: 'Crawler', + url: 'http://www.analyticsseo.com/crawler', + producer: { + name: 'Analytics SEO', + url: 'http://www.analyticsseo.com', + }, + }, + { + regex: 'archive\\.org_bot|special_archiver', + name: 'archive.org bot', + category: 'Crawler', + url: 'https://archive.org/details/archive.org_bot', + producer: { + name: 'The Internet Archive', + url: 'https://archive.org', + }, + }, + { + regex: 'Ask Jeeves/Teoma', + name: 'Ask Jeeves', + category: 'Search bot', + url: '', + producer: { + name: 'Ask Jeeves Inc.', + url: 'http://www.ask.com', + }, + }, + { + regex: 'Backlink-Check\\.de', + name: 'Backlink-Check.de', + category: 'Crawler', + url: 'http://www.backlink-check.de/bot.html', + producer: { + name: 'Mediagreen Medienservice', + url: 'http://www.backlink-check.de', + }, + }, + { + regex: 'BacklinkCrawler', + name: 'BacklinkCrawler', + category: 'Crawler', + url: 'http://www.backlinktest.com/crawler.html', + producer: { + name: '2.0Promotion GbR', + url: 'http://www.backlinktest.com', + }, + }, + { + regex: 'Baidu.*spider|baidu Transcoder', + name: 'Baidu 
Spider', + category: 'Search bot', + url: 'http://www.baidu.com/search/spider.htm', + producer: { + name: 'Baidu', + url: 'http://www.baidu.com', + }, + }, + { + regex: 'BazQux', + name: 'BazQux Reader', + url: 'https://bazqux.com/fetcher', + category: 'Feed Fetcher', + producer: { + name: '', + url: '', + }, + }, + { + regex: 'Better Uptime Bot', + name: 'Better Uptime Bot', + category: 'Site Monitor', + url: 'https://betteruptime.com/faq', + producer: { + name: 'Better Uptime', + url: 'https://betteruptime.com/', + }, + }, + { + regex: + 'MSNBot|msrbot|bingbot|bingadsbot|BingPreview|msnbot-(UDiscovery|NewsBlogs)|adidxbot', + name: 'BingBot', + category: 'Search bot', + url: 'http://search.msn.com/msnbot.htmn', + producer: { + name: 'Microsoft Corporation', + url: 'http://www.microsoft.com', + }, + }, + { + regex: 'Blekkobot', + name: 'Blekkobot', + category: 'Search bot', + url: 'http://blekko.com/about/blekkobot', + producer: { + name: 'Blekko', + url: 'http://blekko.com', + }, + }, + { + regex: 'BLEXBot', + name: 'BLEXBot Crawler', + category: 'Crawler', + url: 'http://webmeup-crawler.com', + producer: { + name: 'WebMeUp', + url: 'http://webmeup.com', + }, + }, + { + regex: 'Bloglovin', + name: 'Bloglovin', + url: 'http://www.bloglovin.com', + category: 'Feed Fetcher', + producer: { + name: '', + url: '', + }, + }, + { + regex: 'Blogtrottr', + name: 'Blogtrottr', + url: '', + category: 'Feed Fetcher', + producer: { + name: 'Blogtrottr Ltd', + url: 'https://blogtrottr.com/', + }, + }, + { + regex: 'BoardReader Blog Indexer', + name: 'BoardReader Blog Indexer', + category: 'Crawler', + producer: { + name: 'BoardReader', + url: 'https://boardreader.com/', + }, + }, + { + regex: 'BountiiBot', + name: 'Bountii Bot', + category: 'Search bot', + url: 'http://bountii.com/contact.php', + producer: { + name: 'Bountii Inc.', + url: 'http://bountii.com', + }, + }, + { + regex: 'Browsershots', + name: 'Browsershots', + category: 'Service Agent', + url: 
'http://browsershots.org/faq', + producer: { + name: 'Browsershots.org', + url: 'http://browsershots.org', + }, + }, + { + regex: 'BUbiNG', + name: 'BUbiNG', + category: 'Crawler', + url: 'http://law.di.unimi.it/BUbiNG.html', + producer: { + name: 'The Laboratory for Web Algorithmics (LAW)', + url: 'http://law.di.unimi.it/software.php#buging', + }, + }, + { + regex: '(? { + if (new RegExp(bot.regex).test(ua)) { + return true; + } + return false; + }); + + if (!res) { + return null; + } + + return { + name: res.name, + type: res.category, + }; +} diff --git a/apps/sdk-api/src/routes/event.router.ts b/apps/sdk-api/src/routes/event.router.ts index 0e820dee..61e99a43 100644 --- a/apps/sdk-api/src/routes/event.router.ts +++ b/apps/sdk-api/src/routes/event.router.ts @@ -1,17 +1,26 @@ +import { isBot as isGetBot } from '@/bots'; import * as controller from '@/controllers/event.controller'; import { validateSdkRequest } from '@/utils/auth'; import type { FastifyPluginCallback } from 'fastify'; const eventRouter: FastifyPluginCallback = (fastify, opts, done) => { fastify.addHook('preHandler', (req, reply, done) => { + const isBot = req.headers['user-agent'] + ? isGetBot(req.headers['user-agent']) + : false; + if (isBot) { + reply.log.warn({ ...req.headers, bot: isBot }, 'Bot detected'); + reply.status(202).send('OK'); + /* The reply has already been sent: stop here so validation below never calls done() or sends a second (401) response for this request. */ + return; + } + validateSdkRequest(req.headers) .then((projectId) => { req.projectId = projectId; done(); }) .catch((e) => { - console.log(e); - reply.status(401).send(); }); });