diff --git a/head-node.scm b/head-node.scm
index 6fdffb38cceb35dad9a6f267d39daf98224625ca..67401db4fb21b2296b305b2c86b80ab3576497d0 100644
--- a/head-node.scm
+++ b/head-node.scm
@@ -7,6 +7,7 @@
 (use-modules (gnu)
              ((guix store) #:select (%store-prefix))
+             (guix git-download)
              (guix packages)
              (guix modules)
              ((guix utils) #:select (substitute-keyword-arguments))
@@ -349,20 +350,80 @@ CALENDAR, a gexp, and ensures at least FREE-SPACE GiB are available."
 ;;; NGINX.
 ;;;
 
+(define ai.robots.txt
+  (let ((commit "5e7c3c432f8bad894363c7289b888328f98963f3"))
+    (origin
+      (method git-fetch)
+      (uri (git-reference
+            (url "https://github.com/ai-robots-txt/ai.robots.txt")
+            (commit commit)))
+      (file-name (string-append "ai.robots.txt-" (string-take commit 7)
+                                "-checkout"))
+      (sha256
+       (base32
+        "14yblgpnnndzphfi8d6hlc44j7daz0w7x1p55if45jpsz34z1czc")))))
+
+(define robot-exclusion-nginx-file
+  ;; Return an nginx configuration file that can be included in the main file
+  ;; to return 403 when the user-agent string matches a known AI robot that
+  ;; does not respect 'robots.txt'.
+  (computed-file "robot-exclusion.nginx.conf"
+                 (with-imported-modules '((guix build utils))
+                   #~(begin
+                       (use-modules (guix build utils)
+                                    (ice-9 match)
+                                    (ice-9 rdelim)
+                                    (ice-9 regex))
+
+                       (define (robot-user-agents)
+                         (define prefix
+                           "User-agent: ")
+
+                         (call-with-input-file #$(file-append ai.robots.txt
+                                                              "/robots.txt")
+                           (lambda (port)
+                             (let loop ((user-agents '()))
+                               (match (read-line port)
+                                 ((? eof-object?)
+                                  (reverse user-agents))
+                                 (line
+                                  (if (string-prefix? prefix line)
+                                      (loop (cons (string-drop line
+                                                               (string-length
+                                                                prefix))
+                                                  user-agents))
+                                      (loop user-agents))))))))
+
+                       (call-with-output-file #$output
+                         (lambda (port)
+                           (format port "\
+# Automatically generated from 'ai.robots.txt'.
+
+if ($http_user_agent ~~ \"(~a)\" ) {
+    return 403;
+    break;
+}\n"
+                                   (string-join (map regexp-quote
+                                                     (robot-user-agents))
+                                                "|"))))))))
+
 (define %nginx-config
   (computed-file "nginx-config"
                  (with-imported-modules
-                  '((guix build utils))
-                  #~(begin
-                      (use-modules (guix build utils))
-
-                      (mkdir #$output)
-                      (chdir #$output)
-                      (symlink #$(local-file "nginx-config/nginx.conf")
-                               "nginx.conf")
-                      (copy-file #$(local-file
-                                    "nginx-config/nginx-locations.conf")
-                                 "nginx-locations.conf")))))
+                  '((guix build utils))
+                  #~(begin
+                      (use-modules (guix build utils))
+
+                      (mkdir #$output)
+                      (chdir #$output)
+                      (copy-file #$(local-file "nginx-config/nginx.conf")
+                                 "nginx.conf")
+                      (substitute* "nginx.conf"
+                        (("@ROBOT-EXCLUSION@")
+                         #$robot-exclusion-nginx-file))
+                      (copy-file #$(local-file
+                                    "nginx-config/nginx-locations.conf")
+                                 "nginx-locations.conf")))))
 
 (define %nginx-gitlab-token
   ;; Create /etc/nginx-tokens with a random token if it doesn't exist.
diff --git a/nginx-config/nginx.conf b/nginx-config/nginx.conf
index ec87f3b6d371d9e655e61f727d7d63d1271bd587..b12a24dc52870b8409ea5361b34607565a52c6d8 100644
--- a/nginx-config/nginx.conf
+++ b/nginx-config/nginx.conf
@@ -45,6 +45,9 @@ http {
 
       add_header X-Frame-Options SAMEORIGIN;
 
+      # Return 403 when the user-agent string is that of an AI robot.
+      include @ROBOT-EXCLUSION@;
+
       include nginx-locations.conf;
   }
 
@@ -74,6 +77,9 @@ http {
       add_header Strict-Transport-Security max-age=15552000;
      add_header X-Frame-Options SAMEORIGIN;
 
+      # Return 403 when the user-agent string is that of an AI robot.
+      include @ROBOT-EXCLUSION@;
+
       include nginx-locations.conf;
   }
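
The 'robot-user-agents' procedure above keeps only what follows the "User-agent: " prefix on each line of the pinned robots.txt and ignores everything else. As a sketch of its behavior, assuming an input like

    User-agent: GPTBot
    User-agent: CCBot
    Disallow: /

it returns the list ("GPTBot" "CCBot") in file order; the agent names here are only illustrative, the real list comes from the ai.robots.txt checkout pinned by the 'origin' above.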
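Guile's 'format' renders '~~' as a literal '~' and '~a' as the '|'-joined, regexp-quoted user agents, so the generated robot-exclusion.nginx.conf should come out roughly like this (again with illustrative agent names):

    # Automatically generated from 'ai.robots.txt'.

    if ($http_user_agent ~ "(GPTBot|CCBot|ClaudeBot)" ) {
        return 403;
        break;
    }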
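At build time, 'substitute*' replaces the @ROBOT-EXCLUSION@ placeholder in nginx.conf with the store file name of that generated file, so the 'include' directives end up reading along the lines of (hash elided):

    include /gnu/store/<hash>-robot-exclusion.nginx.conf;

This is also why nginx.conf is now copied and patched rather than symlinked: 'substitute*' needs a writable file of its own instead of the read-only store item the symlink used to point at.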