Search code examples
expressrequesturl-shortener

How to filter meta preview requests from bots for a URL shortener?


I built an internal custom URL shortener for my team in Node.js. However, when a generated short link is added to a message in Instagram, Slack, etc., the platform fetches the page's meta tags to display a title, image and description as a preview attached to the message.

Issue: This preview request is currently counted as a click by my logic, since every request to the short link increments the click count before the 301 redirect to the long URL.

How can I robustly filter out social meta preview requests, so that the engagement count is tracked correctly for this tool?


Solution

  • Not 100% robust, but this is how I currently decided to solve the issue:

    Step 1: create a list of known bot user-agents.

    Step 2: check whether the incoming request's user-agent includes any of the known bot user-agents. If it does, don't increment clicks; if it doesn't, the request is not from a bot, so increment clicks.

    Code below:

    // Lowercase User-Agent fragments that this tool treats as bots (entries name
    // crawlers, link-preview fetchers and HTTP client libraries such as 'curl' or
    // 'python-requests').
    // The check that follows this list lowercases/trims the request's User-Agent
    // and tests whether it *contains* any entry, so each entry acts as a substring:
    //   - longer variants (e.g. 'facebookexternalhit/1.1', 'googlebot-image') are
    //     already covered by their shorter prefix entries;
    //   - NOTE(review): short generic entries such as 'base', 'fetch', 'ruby',
    //     'updated', 'wire' or 'crawl' match any User-Agent containing those
    //     letters — confirm they do not flag real browsers / in-app webviews.
    const KNOWN_BOTS_USER_AGENTS = [
    'google favicon',
    'google keyword',
    'google page speed',
    'google pp default',
    'google search console',
    'google structured data testing tool',
    'google web preview',
    'google-adwords',
    'google-adwords-instant',
    'google-read-aloud',
    'google-structured-data-testing-tool',
    'googlebot',
    'googlebot-image',
    'googlebot-mobile',
    'googlebot-news',
    'googlebot-video',
    'googlebot/images',
    'facebookexternalhit',
    'facebookexternalhit/1.0',
    'facebookexternalhit/1.1',
    'facebookplatform',
    'facebot',
    'twitterbot',
    'bingbot',
    'bingpreview',
    'slurp',
    'duckduckbot',
    'baiduspider',
    'telegrambot',
    'yandexbot',
    'sogou',
    'exabot',
    'linkedinbot',
    'embedly',
    'quora link preview',
    'showyoubot',
    'outbrain',
    'pinterestbot',
    'pinterest/0.',
    'developers.google.com/+/web/snippet',
    'slackbot',
    'vkshare',
    'w3c_validator',
    'redditbot',
    'applebot',
    'whatsapp',
    'flipboard',
    'tumblr',
    'bitlybot',
    'skypeuripreview',
    'nuzzel',
    'discordbot',
    'qwantify',
    'yahoo link preview',
    'yahoo! slurp',
    'yahoo! slurp china',
    'yahoocachesystem',
    'yahooysmcm',
    'baidu',
    'baiduspider-ads',
    'baiduspider-cpro',
    'baiduspider-favo',
    'baiduspider-image',
    'baiduspider-news',
    'baiduspider-video',
    '360spider',
    '360spider-image',
    '360spider-video',
    'aboundex',
    'accoona-ai-agent',
    'acoon',
    'acoonbot',
    'addthis',
    'addthis.com',
    'adidxbot',
    'admantx',
    'adsbot-google',
    'adsbot-google-mobile',
    'adsbot-google-mobile-apps',
    'ahc',
    'ahc/2.0',
    'ahrefsbot',
    'aihit',
    'airmail',
    'akula',
    'alexa',
    'alexabot',
    'amagi',
    'androiddownloadmanager',
    'anemone',
    'apercite',
    'apis-google',
    'applenewsbot',
    'aprcovi',
    'arachmo',
    'archive-com',
    'archive.org_bot',
    'aria2',
    'ask jeeves/teoma',
    'asterias',
    'b-l-i-t-z-bot',
    'backlink-check',
    'base',
    'bazqux',
    'bdfetch',
    'begunadvertising',
    'bibnum.bnf',
    'bigbozz',
    'biglotron',
    'binlar',
    'bitrix link preview',
    'blexbot',
    'bloglovin',
    'blogtrottr',
    'boitho',
    'boitho.com-dc',
    'bolzplatz',
    'browsershots',
    'bubing',
    'bublupbot',
    'butterfly',
    'buzzsumo',
    'bytespider',
    'capsulechecker',
    'cc metadata scaper',
    'ccbot',
    'censysinspect',
    'cerberian drtrs',
    'cg-eye',
    'changedetection',
    'charlotte',
    'checkhost',
    'chrome-lighthouse',
    'cispa vulnerability notification',
    'cjnetworkquality',
    'cliqzbot',
    'cloudflare-alwaysonline',
    'cloudinary',
    'cmcm',
    'coc coc',
    'coccoc',
    'coccocbot-image',
    'coccocbot-web',
    'collections-updater',
    'commons-httpclient',
    'comodo ssl checker',
    'content crawler spider',
    'convera',
    'cookiereports.com',
    'covario-ids',
    'crawl',
    'crawlforlove',
    'crystalsemanticsbot',
    'csimarket',
    'curb',
    'curl',
    'custo',
    'datacha0s',
    'dataparksearch',
    'dataprovider.com',
    'daum',
    'daumoa',
    'dazoobot',
    'deusu',
    'digg',
    'domainappender',
    'dotbot',
    'dotsemantic',
    'downforeveryoneorjustme',
    'drupact',
    'duckduckgo-favicons-bot',
    'earthcom',
    'earthcom.info',
    'easouspider',
    'easy-thumb',
    'ec2linkfinder',
    'ecairn-grabber',
    'eccp',
    'econtext',
    'electricmonk',
    'erocheese',
    'euripbot',
    'europarchive.org',
    'evc-batch',
    'eventmachine httpclient',
    'exploratodo',
    'ezooms',
    'fairshare',
    'faraday v',
    'fast enterprise crawler',
    'fast-webcrawler',
    'favicon',
    'favorg',
    'feed wrangler',
    'feedbin',
    'feedburner',
    'feedchecker',
    'feedfetcher-google',
    'feedly',
    'feedspot',
    'feedwind',
    'femtosearchbot',
    'fetch',
    'fetch api',
    'fever',
    'findlink',
    'findthatfile',
    'findxbot',
    'flamingo_searchengine',
    'flipboardbrowserproxy',
    'fluffy',
    'g00g1e',
    'genieo',
    'getprismatic.com',
    'gigablast',
    'gigablastopensource',
    'gingercrawler',
    'go-http-client',
    'gofetch',
    'gomezagent',
    'goodzer',
    'gotsitemonitor',
    'gozilla',
    'grapeshotcrawler',
    'grouphigh',
    'grub.org',
    'gslfbot',
    'gt::www',
    'gtmetrix',
    'h00p',
    'haosouspider',
    'hatena',
    'hawkreader',
    'heritrix',
    'holmes',
    'hootsuite',
    'hosttracker',
    'ht://check',
    'htdig',
    'http::lite',
    'httrack',
    'hubpages',
    'hubspot connect',
    'hubspot marketing grader',
    'hyperzbozi.cz feeds',
    'i2kconnect',
    'ia_archiver',
    'iaskspider',
    'icc-crawler',
    'ichiro',
    'iecheck',
    'iisbot',
    'infegy',
    'infohelfer',
    'infoseek',
    'infowizards reciprocal link system pro',
    'instapaper',
    'integromedb',
    'iodc',
    'ioi',
    'ips-agent',
    'iqdb',
    'irokez',
    'isitup.org',
    'iskanie',
    'istellabot',
    'izsearch',
    'james bot',
    'janforman',
    'jigsaw',
    'jikespider',
    'jobboersebot',
    'js-kit',
    'justview',
    'k7mlwcbot',
    'keepright openstreetmap checker',
    'keycdn',
    'kickfire',
    'kimonolabs',
    'kml-google',
    'komodiabot',
    'kouio',
    'l.webis',
    'larbin',
    'libwww',
    'liebaofast',
    'link valet',
    'linkcheck',
    'linkdetox',
    'linkdex',
    'linkexaminer',
    'linkpadbot',
    'linktiger',
    'linkvalet',
    'lipperhey',
    'lipperhey spider',
    'livedoor check',
    'loadimpactpageanalyzer',
    'loadimpactrload',
    'longurl api',
    'ltx71',
    'lwp-trivial',
    'lycos',
    'magpierss',
    'mail.ru',
    'mail.ru_bot',
    'mandrill',
    'marketinggrader',
    'mediapartners-google',
    'megaindex',
    'megaindex.ru',
    'metaheadersbot',
    'metauri',
    'metauri api',
    'microsearch',
    'microsoft office existence',
    'microsoft office protocol discovery',
    'microsoft windows network diagnostics',
    'microsoft-rds',
    'mindjet',
    'miniflux',
    'mixrankbot',
    'mj12bot',
    'mnogosearch',
    'mogimogi',
    'mojeek',
    'mojeekbot',
    'mojolicious',
    'montools',
    'moreover',
    'morning paper',
    'mowser',
    'mrcgiguy',
    'msfrontpage',
    'mshots',
    'msnbot',
    'msnbot-media',
    'msnbot-products',
    'msrbot',
    'mvaclient',
    'nagios',
    'najdi.si',
    'netcraftsurveyagent',
    'netlyzer fastprobe',
    'netresearch',
    'netresearchserver',
    'netshelter contentscan',
    'nettrack',
    'netvibes',
    'newsblur',
    'newsgator',
    'newsme',
    'newspaper',
    'ng-search',
    'nineconnections',
    'nineconnections.com',
    'nlnz_iaharvester',
    'nmap scripting engine',
    'noyona',
    'nusearch spider',
    'nutch',
    'nutchcvs',
    'nworm',
    'nymesis',
    'oegp',
    'offline explorer',
    'omea reader',
    'omgili',
    'online domain tools',
    'online link validator',
    'online website link checker',
    'opencalaissemanticproxy',
    'openstat',
    'openvas',
    'optimizer',
    'orangebot',
    'orbiter',
    'orgprobe',
    'ow-02',
    'ow.ly',
    'owlin',
    'owncloud news',
    'page2rss',
    'pagepeeker',
    'pagesinventory',
    'panopta',
    'panscient',
    'paperlibot',
    'peew',
    'phpcrawl',
    'pinterest',
    'piplbot',
    'plukkie',
    'pompos',
    'postano',
    'postpost',
    'postrank',
    'proximic',
    'prtg network monitor',
    'psbot',
    'pump',
    'python-httplib2',
    'python-requests',
    'python-urllib',
    'qirina hurdler',
    'qseero',
    'radian6',
    'rambler',
    'rebelmouse',
    'rel link checker lite',
    'retrevopageanalyzer',
    'riddler',
    'robosourcer',
    'ruby',
    'sbider',
    'scoutjet',
    'scouturlmonitor',
    'scrapy',
    'scrubby',
    'searchsight',
    'semanticdiscovery',
    'semanticjuice',
    'semrushbot',
    'seoengworldbot',
    'seokicks',
    'seopreview',
    'seznam screenshot-generator',
    'seznambot',
    'shopwiki',
    'sitebar',
    'sitecondor',
    'siteexplorer.info',
    'siteinspector',
    'slackbot-linkexpanding',
    'sleuth',
    'smartdownload',
    'smtbot',
    'snappy',
    'snoopy',
    'socialrankiobot',
    'sogou blog',
    'sogou head spider',
    'sogou inst spider',
    'sogou link spider',
    'sogou news spider',
    'sogou orion spider',
    'sogou page spider',
    'sogou partner spider',
    'sogou pic spider',
    'sogou spider',
    'sogou spider2',
    'sogou video spider',
    'sogou web spider',
    'sogou-test-spider',
    'sonic',
    'sortsite',
    'sosospider',
    'spaziodati',
    'spbot',
    'speedy',
    'sputnikbot',
    'sqworm',
    'stackrambler',
    'suggybot',
    'summify',
    'sysomos',
    't0phackteam',
    'tailrank',
    'tarantula',
    'teoma',
    'the architext spider',
    'the expert html source viewer',
    'theoldreader.com',
    'thumbshots',
    'thumbsniper',
    'tineye',
    'tiny tiny rss',
    'tomato bot',
    'topster',
    'touche.com',
    'traackr.com',
    'truwogps',
    'tweetedtimes bot',
    'tweetmemebot',
    'twikle',
    'twingly',
    'twingly recon',
    'unwindfetchor',
    'updated',
    'uptimebot',
    'urlresolver',
    'vagabondo',
    'validator.nu',
    'viber',
    'vivante link checker',
    'vortex',
    'voyager',
    'vyu2',
    'wbsrch',
    'web-archive-net.com.bot',
    'webauto',
    'webcollage',
    'webcookies',
    'webdoc',
    'webimagecollector',
    'webimages',
    'webindex',
    'webkit2png',
    'webmastercoffee',
    'webmeup-crawler',
    'webmon',
    'webscreenie',
    'webster',
    'webstripper',
    'webthumbnail',
    'wesee:ads/pagebot',
    'wesee:search',
    'whack',
    'wire',
    'woriobot',
    'wotbox',
    'wp engine site check',
    'wprecon.com survey',
    'wume_crawler',
    'www-mechanize',
    'xaldon_webspider',
    'xenu link sleuth',
    'xing-contenttabreceiver',
    'xmlrpsee',
    'xovibot',
    'y!j',
    'yacybot',
    'yandeg',
    'yandex',
    'yandexadnet',
    'yandexantivirus',
    'yandexblogs',
    'yandexcatalog',
    'yandexdirect',
    'yandexfavicons',
    'yandexfordomain',
    'yandeximageresizer',
    'yandeximages',
    'yandexmedia',
    'yandexmetrika',
    'yandexmobilebot',
    'yandexnews',
    'yandexscreenshotbot',
    'yandexsearchconsole',
    'yandexspravbot',
    'yandexturbo',
    'yandexverticals',
    'yandexvideo',
    'yandexwebmaster',
    'yasaklibot',
    'yeti',
    'yioopbot',
    'yisouspider',
    'yo-yo',
    'yoleo consumer',
    'yooglifetchagent',
    'yoozbot',
    'youdaobot',
    'zao',
    'zemanta aggregator',
    'zend_http_client',
    'zoominfobot',
    'zyborg',
    ]
    // Classify the incoming request as bot / not-bot from its User-Agent header,
    // so that link-preview fetches and crawlers are not counted as clicks.
    // Reads:  req.headers['user-agent'], KNOWN_BOTS_USER_AGENTS
    // Leaves: `useragent` (raw header or null) and `isBot` (boolean) in scope.
    const useragent = req.headers['user-agent'] || null // grab user-agent from request headers

    // Normalise once (lowercase + trim) instead of on every comparison.
    // A missing or non-string header normalises to '' so it can never throw below.
    const normalizedUserAgent = typeof useragent === 'string' ? useragent.toLowerCase().trim() : ''

    // Requests with no usable User-Agent (absent, blank, or a literal placeholder
    // such as "undefined" / "null" / "empty") are treated as bots.
    const PLACEHOLDER_USER_AGENTS = ['', 'undefined', 'null', 'empty']
    let isBot = PLACEHOLDER_USER_AGENTS.includes(normalizedUserAgent)

    // Otherwise it is a bot if the User-Agent contains any known bot fragment
    // (substring match; list entries are normalised the same way as the header).
    if (!isBot) {
        isBot = KNOWN_BOTS_USER_AGENTS.some((botUserAgent) =>
            normalizedUserAgent.includes(botUserAgent.toLowerCase().trim())
        )
    }

    if (isBot) {
        // Log the normalised value: `useragent` itself is null when the header is
        // missing, and calling .toLowerCase() on it here would throw a TypeError.
        console.log('useragent is a bot --> ', normalizedUserAgent)
    }