From 647712caf6e4e61eac18fca0a38ca7ee2beaadef Mon Sep 17 00:00:00 2001 From: Kim Gustyr Date: Wed, 5 Mar 2025 10:14:13 +0000 Subject: [PATCH] feat: Switch existing task processor health checks to new liveness probe (#5161) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Rodrigo López Dato --- Dockerfile | 11 +- api/app/settings/common.py | 5 +- api/app/urls.py | 103 +++++++++--------- api/manage.py | 7 ++ api/poetry.lock | 7 +- api/pyproject.toml | 4 +- api/scripts/__init__.py | 0 api/scripts/healthcheck.py | 24 +++- api/scripts/run-docker.sh | 23 ++-- docker-compose.pgpool.yml | 6 - docker-compose.yml | 6 - .../configuration/task-processor.md | 41 +++---- 12 files changed, 130 insertions(+), 107 deletions(-) create mode 100644 api/scripts/__init__.py diff --git a/Dockerfile b/Dockerfile index 3315e5072267..a513c6d9a495 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,11 +19,11 @@ # Build a SaaS API image: # $ GH_TOKEN=$(gh auth token) docker build -t flagsmith-saas-api:dev --target saas-api \ # --secret="id=sse_pgp_pkey,src=./sse_pgp_pkey.key"\ -# --secret="id=github_private_cloud_token,env=GH_TOKEN" . +# --secret="id=github_private_cloud_token,env=GH_TOKEN" . # Build a Private Cloud Unified image: # $ GH_TOKEN=$(gh auth token) docker build -t flagsmith-private-cloud:dev --target private-cloud-unified \ -# --secret="id=github_private_cloud_token,env=GH_TOKEN" . +# --secret="id=github_private_cloud_token,env=GH_TOKEN" . 
# Table of Contents # Stages are described as stage-name [dependencies] @@ -90,7 +90,7 @@ ARG PYTHON_VERSION RUN apk add build-base linux-headers curl git \ python-${PYTHON_VERSION} \ python-${PYTHON_VERSION}-dev \ - py${PYTHON_VERSION}-pip + py${PYTHON_VERSION}-pip COPY api/pyproject.toml api/poetry.lock api/Makefile ./ ENV POETRY_VIRTUALENVS_IN_PROJECT=true \ @@ -119,7 +119,7 @@ FROM wolfi-base AS api-runtime # Install Python and make it available to venv entrypoints ARG PYTHON_VERSION -RUN apk add python-${PYTHON_VERSION} && \ +RUN apk add curl python-${PYTHON_VERSION} && \ mkdir /build/ && ln -s /usr/local/ /build/.venv WORKDIR /app @@ -139,6 +139,9 @@ ENTRYPOINT ["/app/scripts/run-docker.sh"] CMD ["migrate-and-serve"] +HEALTHCHECK --interval=2s --timeout=2s --retries=3 --start-period=20s \ + CMD curl -f http://localhost:8000/health/liveness || exit 1 + # * api-runtime-private [api-runtime] FROM api-runtime AS api-runtime-private diff --git a/api/app/settings/common.py b/api/app/settings/common.py index 2aac10cf9758..a026cf143009 100644 --- a/api/app/settings/common.py +++ b/api/app/settings/common.py @@ -986,6 +986,9 @@ # Used to control the size(number of identities) of the project that can be self migrated to edge MAX_SELF_MIGRATABLE_IDENTITIES = env.int("MAX_SELF_MIGRATABLE_IDENTITIES", 100000) +# RUN_BY_PROCESSOR is set by the task processor entrypoint +TASK_PROCESSOR_MODE = env.bool("RUN_BY_PROCESSOR", False) + # Setting to allow asynchronous tasks to be run synchronously for testing purposes # or in a separate thread for self-hosted users TASK_RUN_METHOD = env.enum( @@ -993,7 +996,7 @@ type=TaskRunMethod, default=( TaskRunMethod.TASK_PROCESSOR.value - if env.bool("RUN_BY_PROCESSOR", False) + if TASK_PROCESSOR_MODE else TaskRunMethod.SEPARATE_THREAD.value ), ) diff --git a/api/app/urls.py b/api/app/urls.py index cc02e5fe0b55..b04dd45a0302 100644 --- a/api/app/urls.py +++ b/api/app/urls.py @@ -10,67 +10,72 @@ from . 
import views urlpatterns = [ - re_path(r"^api/v1/", include("api.urls.deprecated", namespace="api-deprecated")), - re_path(r"^api/v1/", include("api.urls.v1", namespace="api-v1")), - re_path(r"^api/v2/", include("api.urls.v2", namespace="api-v2")), - re_path(r"^admin/", admin.site.urls), re_path(r"^health/liveness/?", views.version_info), re_path(r"^health/readiness/?", include("health_check.urls")), - re_path(r"^health", include("health_check.urls", namespace="health")), - # Aptible health checks must be on /healthcheck and cannot redirect - # see https://www.aptible.com/docs/core-concepts/apps/connecting-to-apps/app-endpoints/https-endpoints/health-checks - path("healthcheck", include("health_check.urls", namespace="aptible")), - re_path(r"^version", views.version_info, name="version-info"), - re_path( - r"^sales-dashboard/", - include("sales_dashboard.urls", namespace="sales_dashboard"), - ), - # this url is used to generate email content for the password reset workflow - re_path( - r"^password-reset/confirm/(?P[0-9A-Za-z_\-]+)/(?P[0-9A-Za-z]{1," - r"13}-[0-9A-Za-z]{1,20})/$", - password_reset_redirect, - name="password_reset_confirm", - ), - re_path( - r"^config/project-overrides", - views.project_overrides, - name="project_overrides", - ), path("processor/", include("task_processor.urls")), - path( - "robots.txt", - TemplateView.as_view(template_name="robots.txt", content_type="text/plain"), - ), ] -if settings.DEBUG: - import debug_toolbar # type: ignore[import-untyped,unused-ignore] +if not settings.TASK_PROCESSOR_MODE: + urlpatterns += [ + re_path( + r"^api/v1/", include("api.urls.deprecated", namespace="api-deprecated") + ), + re_path(r"^api/v1/", include("api.urls.v1", namespace="api-v1")), + re_path(r"^api/v2/", include("api.urls.v2", namespace="api-v2")), + re_path(r"^admin/", admin.site.urls), + re_path(r"^health", include("health_check.urls", namespace="health")), + # Aptible health checks must be on /healthcheck and cannot redirect + # see 
https://www.aptible.com/docs/core-concepts/apps/connecting-to-apps/app-endpoints/https-endpoints/health-checks + path("healthcheck", include("health_check.urls", namespace="aptible")), + re_path(r"^version", views.version_info, name="version-info"), + re_path( + r"^sales-dashboard/", + include("sales_dashboard.urls", namespace="sales_dashboard"), + ), + # this url is used to generate email content for the password reset workflow + re_path( + r"^password-reset/confirm/(?P[0-9A-Za-z_\-]+)/(?P[0-9A-Za-z]{1," + r"13}-[0-9A-Za-z]{1,20})/$", + password_reset_redirect, + name="password_reset_confirm", + ), + re_path( + r"^config/project-overrides", + views.project_overrides, + name="project_overrides", + ), + path( + "robots.txt", + TemplateView.as_view(template_name="robots.txt", content_type="text/plain"), + ), + ] +if settings.DEBUG: urlpatterns = [ - re_path(r"^__debug__/", include(debug_toolbar.urls)), # type: ignore[attr-defined,unused-ignore] + re_path(r"^__debug__/", include("debug_toolbar.urls")), ] + urlpatterns -if settings.SAML_INSTALLED: - urlpatterns.append(path("api/v1/auth/saml/", include("saml.urls"))) +if settings.SAML_INSTALLED: # pragma: no cover + urlpatterns += [ + path("api/v1/auth/saml/", include("saml.urls")), + ] if settings.WORKFLOWS_LOGIC_INSTALLED: # pragma: no cover workflow_views = importlib.import_module("workflows_logic.views") - urlpatterns.extend( - [ - path("api/v1/features/workflows/", include("workflows_logic.urls")), - path( - "api/v1/environments//create-change-request/", - workflow_views.create_change_request, - name="create-change-request", - ), - path( - "api/v1/environments//list-change-requests/", - workflow_views.list_change_requests, - name="list-change-requests", - ), - ] - ) + urlpatterns += [ + path("api/v1/features/workflows/", include("workflows_logic.urls")), + path( + "api/v1/environments//create-change-request/", + workflow_views.create_change_request, + name="create-change-request", + ), + path( + 
"api/v1/environments//list-change-requests/", + workflow_views.list_change_requests, + name="list-change-requests", + ), + ] + if settings.SERVE_FE_ASSETS: # pragma: no cover # add route to serve FE assets for any unrecognised paths diff --git a/api/manage.py b/api/manage.py index d82499ec22b5..be05e641ce59 100755 --- a/api/manage.py +++ b/api/manage.py @@ -3,6 +3,13 @@ import sys if __name__ == "__main__": + # Backwards compatibility for task-processor health checks + # See https://github.com/Flagsmith/flagsmith-task-processor/issues/24 + if "checktaskprocessorthreadhealth" in sys.argv: + import scripts.healthcheck + + scripts.healthcheck.main() + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "app.settings.local") try: diff --git a/api/poetry.lock b/api/poetry.lock index 4ce3700325d0..e2fe363174da 100644 --- a/api/poetry.lock +++ b/api/poetry.lock @@ -2000,14 +2000,15 @@ dj-database-url = "~0.5.0" django-health-check = "~3.18.2" drf-yasg = "~1.21.6" environs = "~9.2.0" +gunicorn = "*" psycopg2-binary = "~2.9.5" simplejson = "~3.19.1" [package.source] type = "git" url = "https://github.com/Flagsmith/flagsmith-task-processor" -reference = "v1.2.2" -resolved_reference = "c833e2239b40d9aca813e7a2aedfb55751b031c2" +reference = "v1.3.1" +resolved_reference = "fffd43bc92bfe477fbc1e4ab435610b6e23639d2" [[package]] name = "freezegun" @@ -5198,4 +5199,4 @@ files = [ [metadata] lock-version = "2.1" python-versions = ">3.11,<3.13" -content-hash = "1e9641df3e4e44aab9ed2537d8130695837c5ef5b0bd57f2d1acdb78efe397a2" +content-hash = "3263dce1b5b4fdad2c7fcb52e98e4fec1f77bb44ae5edb7f430a84318fae6bef" diff --git a/api/pyproject.toml b/api/pyproject.toml index d3611f0eee71..a938204801c7 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -63,7 +63,7 @@ exclude_also = [ ] [tool.coverage.run] -omit = ["scripts/*"] +omit = ["scripts/*", "manage.py"] [tool.pytest.ini_options] addopts = ['--ds=app.settings.test', '-vvvv', '-p', 
'no:warnings'] @@ -146,7 +146,7 @@ pygithub = "2.1.1" hubspot-api-client = "^8.2.1" djangorestframework-dataclasses = "^1.3.1" pyotp = "^2.9.0" -flagsmith-task-processor = { git = "https://github.com/Flagsmith/flagsmith-task-processor", tag = "v1.2.2" } +flagsmith-task-processor = { git = "https://github.com/Flagsmith/flagsmith-task-processor", tag = "v1.3.1" } flagsmith-common = { git = "https://github.com/Flagsmith/flagsmith-common", tag = "v1.4.2" } tzdata = "^2024.1" djangorestframework-simplejwt = "^5.3.1" diff --git a/api/scripts/__init__.py b/api/scripts/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/api/scripts/healthcheck.py b/api/scripts/healthcheck.py index 6a54a0a6680b..c790b168cd70 100644 --- a/api/scripts/healthcheck.py +++ b/api/scripts/healthcheck.py @@ -1,8 +1,26 @@ +import logging import sys import requests -url = "http://localhost:8000/health/liveness" -status = requests.get(url).status_code +HEALTH_LIVENESS_URL = "http://localhost:8000/health/liveness" -sys.exit(0 if 200 >= status < 300 else 1) + +logger = logging.getLogger(__name__) + + +def main() -> None: + logger.warning( + f"This healthcheck, invoked by {' '.join(sys.argv)}, is deprecated. " + f"Use the `{HEALTH_LIVENESS_URL}` endpoint instead." 
+ ) + status_code = requests.get(HEALTH_LIVENESS_URL).status_code + + if status_code != 200: + logger.error(f"Health check failed with status {status_code}") + + sys.exit(0 if 200 <= status_code < 300 else 1) + + +if __name__ == "__main__": + main() diff --git a/api/scripts/run-docker.sh b/api/scripts/run-docker.sh index fb9e0a648787..eb2bfc389231 100755 --- a/api/scripts/run-docker.sh +++ b/api/scripts/run-docker.sh @@ -1,6 +1,10 @@ #!/bin/sh set -e +# common environment variables +ACCESS_LOG_FORMAT=${ACCESS_LOG_FORMAT:-'%(h)s %(l)s %(u)s %(t)s "%(r)s" %(s)s %(b)s "%(f)s" "%(a)s" %({origin}i)s %({access-control-allow-origin}o)s'} +GUNICORN_LOGGER_CLASS=${GUNICORN_LOGGER_CLASS:-'util.logging.GunicornJsonCapableLogger'} + waitfordb() { if [ -z "${SKIP_WAIT_FOR_DB}" ]; then python manage.py waitfordb "$@" @@ -27,8 +31,8 @@ serve() { --workers ${GUNICORN_WORKERS:-3} \ --threads ${GUNICORN_THREADS:-2} \ --access-logfile $ACCESS_LOG_LOCATION \ - --logger-class ${GUNICORN_LOGGER_CLASS:-'util.logging.GunicornJsonCapableLogger'} \ - --access-logformat ${ACCESS_LOG_FORMAT:-'%(h)s %(l)s %(u)s %(t)s "%(r)s" %(s)s %(b)s "%(f)s" "%(a)s" %({origin}i)s %({access-control-allow-origin}o)s'} \ + --logger-class $GUNICORN_LOGGER_CLASS \ + --access-logformat "$ACCESS_LOG_FORMAT" \ --keep-alive ${GUNICORN_KEEP_ALIVE:-2} \ ${STATSD_HOST:+--statsd-host $STATSD_HOST:$STATSD_PORT} \ ${STATSD_HOST:+--statsd-prefix $STATSD_PREFIX} \ @@ -39,11 +43,16 @@ run_task_processor() { if [ -n "$ANALYTICS_DATABASE_URL" ] || [ -n "$DJANGO_DB_NAME_ANALYTICS" ]; then waitfordb --waitfor 30 --migrations --database analytics fi - RUN_BY_PROCESSOR=1 exec python manage.py runprocessor \ - --sleepintervalms ${TASK_PROCESSOR_SLEEP_INTERVAL:-500} \ - --graceperiodms ${TASK_PROCESSOR_GRACE_PERIOD_MS:-20000} \ - --numthreads ${TASK_PROCESSOR_NUM_THREADS:-5} \ - --queuepopsize ${TASK_PROCESSOR_QUEUE_POP_SIZE:-10} + RUN_BY_PROCESSOR=1 python manage.py runprocessor \ + --sleepintervalms 
${TASK_PROCESSOR_SLEEP_INTERVAL_MS:-${TASK_PROCESSOR_SLEEP_INTERVAL:-500}} \ + --graceperiodms ${TASK_PROCESSOR_GRACE_PERIOD_MS:-20000} \ + --numthreads ${TASK_PROCESSOR_NUM_THREADS:-5} \ + --queuepopsize ${TASK_PROCESSOR_QUEUE_POP_SIZE:-10} \ + gunicorn \ + --bind 0.0.0.0:8000 \ + --access-logfile $ACCESS_LOG_LOCATION \ + --logger-class $GUNICORN_LOGGER_CLASS \ + --access-logformat "$ACCESS_LOG_FORMAT" } migrate_analytics_db(){ # if `$ANALYTICS_DATABASE_URL` or DJANGO_DB_NAME_ANALYTICS is set diff --git a/docker-compose.pgpool.yml b/docker-compose.pgpool.yml index 454f812196f4..fac0169251d4 100644 --- a/docker-compose.pgpool.yml +++ b/docker-compose.pgpool.yml @@ -34,12 +34,6 @@ services: TASK_RUN_METHOD: TASK_PROCESSOR # other options are: SYNCHRONOUSLY, SEPARATE_THREAD (default) ports: - 8000:8000 - healthcheck: - test: ['CMD-SHELL', 'python /app/scripts/healthcheck.py'] - interval: 2s - timeout: 2s - retries: 20 - start_period: 20s depends_on: pgpool: condition: service_healthy diff --git a/docker-compose.yml b/docker-compose.yml index 2c25b4032850..1ceb6252e8de 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -57,12 +57,6 @@ services: # EMAIL_USE_TLS: 'true' # optional ports: - 8000:8000 - healthcheck: - test: ['CMD-SHELL', 'python /app/scripts/healthcheck.py'] - interval: 2s - timeout: 2s - retries: 20 - start_period: 20s depends_on: postgres: condition: service_healthy diff --git a/docs/docs/deployment/configuration/task-processor.md b/docs/docs/deployment/configuration/task-processor.md index 9da90587812c..170b009b3203 100644 --- a/docs/docs/deployment/configuration/task-processor.md +++ b/docs/docs/deployment/configuration/task-processor.md @@ -47,44 +47,33 @@ flagsmith_processor: ## Configuring the Processor The processor exposes a number of configuration options to tune the processor to your needs / setup. These configuration -options are via command line arguments when starting the processor. 
+options are via command line arguments when starting the processor. The default Flagsmith container entrypoint expects +these options as `TASK_PROCESSOR_`-prefixed environment variables. -| Argument | Description | Default | -| ------------------- | ------------------------------------------------------------------------- | ------- | -| `--sleepintervalms` | The amount of ms each worker should sleep between checking for a new task | 2000 | -| `--numthreads` | The number of worker threads to run per task processor instance | 5 | -| `--graceperiodms` | The amount of ms before a worker thread is considered 'stuck'. | 20000 | +| Environment variable | Argument | Description | Default | +| ---------------------------------- | ------------------- | -------------------------------------------------------------------------- | ------- | +| `TASK_PROCESSOR_SLEEP_INTERVAL_MS` | `--sleepintervalms` | The amount of ms each worker should sleep between checking for a new task. | 500 | +| `TASK_PROCESSOR_NUM_THREADS` | `--numthreads` | The number of worker threads to run per task processor instance. | 5 | +| `TASK_PROCESSOR_GRACE_PERIOD_MS` | `--graceperiodms` | The amount of ms before a worker thread is considered 'stuck'. | 20000 | +| `TASK_PROCESSOR_QUEUE_POP_SIZE` | `--queuepopsize` | The number of enqueued tasks to retrieve for processing for one iteration. | 10 | ## Monitoring There are a number of options for monitoring the task processor's health. -### Checking Thread / Worker Health +### Health checks -The task processor includes a management command which checks the health of the worker threads which are running tasks. - -``` -python manage.py checktaskprocessorthreadhealth -``` - -The command will exit with either a success exit code (0) or a failure exit code (1). 
- -### API to Task Processor health - -To monitor that the API can send tasks to the processor and that they are successfully run, there is a custom health -check which can be enabled on the general health endpoint (`GET /health?format=json`). This health check needs to be -enabled manually, which can be done by setting the `ENABLE_TASK_PROCESSOR_HEALTH_CHECK` environment variable to `True` -(in the flagsmith application container, not the task processor). Note that this health check is not considered -"critical" and hence, the endpoint will return a 200 OK regardless of whether the task processor is sucessfully -processing tasks or not. +A task processor container exposes `/health/readiness` and `/health/liveness` endpoints for readiness and liveness +probes. The endpoints run simple availability checks. To include a test that enqueues a task and makes sure it's run +to your readiness probe, set `ENABLE_TASK_PROCESSOR_HEALTH_CHECK` environment variable to `True`. ### Task statistics -Within the API, there is an endpoint which returns, in JSON format, statistics about the tasks consumed / to be consumed -by the processor. This endpoint is available at `GET /processor/monitoring`. This will respond with the following data: +Both API and Task processor expose an endpoint which returns Task processor statistics in JSON format. +This endpoint is available at `GET /processor/monitoring`. See an example response below: ```json { - "waiting": 1 + "waiting": 1 // The number of tasks waiting in the queue. } ```