def connection_check(database: str) -> bool:
    """Probe ``database`` with a throwaway connection and cache the outcome.

    A brand-new connection wrapper is created for the probe, connected, and
    asked whether it is usable. The result is stored in the Django cache under
    ``db_connection_active.<database>`` as ``"online"`` or ``"offline"`` for
    ``CONNECTION_CHECK_CACHE_TTL`` seconds.

    Args:
        database: Alias of the database to probe (a key of ``DATABASES``).

    Returns:
        ``True`` when the probe connection reports itself usable, ``False``
        when it does not or when any exception is raised while probing.
    """
    conn = None
    try:
        conn = connections.create_connection(database)
        conn.connect()
        usable = conn.is_usable()
        if not usable:
            logger.warning(
                f"Unable to access database {database} during connection check"
            )
    except Exception:
        # Best-effort probe: any failure simply marks the database offline.
        usable = False
        logger.error(
            "Encountered exception during connection",
            exc_info=True,
        )
    finally:
        # The wrapper was created only for this probe, so nothing else will
        # ever close it. Release it here rather than waiting for garbage
        # collection.
        if conn is not None:
            try:
                conn.close()
            except Exception:
                logger.debug(
                    "Failed to close probe connection",
                    exc_info=True,
                )

    cache.set(
        f"db_connection_active.{database}",
        "online" if usable else "offline",
        CONNECTION_CHECK_CACHE_TTL,
    )

    return usable
def _get_replica(self, replicas: list[str]) -> None | str:
    """Return the first replica alias considered online, or ``None``.

    Candidates are tried one at a time: at random for the ``DISTRIBUTED``
    strategy, in list order for ``SEQUENTIAL``. For each candidate the cached
    status under ``db_connection_active.<alias>`` is consulted first; only
    when there is no cached status is a live ``connection_check`` performed.

    Args:
        replicas: Database aliases to choose from. The list is not modified.

    Returns:
        The alias of a usable replica, or ``None`` when ``replicas`` is empty
        or every candidate is offline.

    Raises:
        ImproperlyConfiguredError: If ``settings.REPLICA_READ_STRATEGY`` is
            neither a ``ReplicaReadStrategy`` member nor one of its values.
    """
    if not replicas:
        return None

    # The setting may arrive either as the plain string value or as a
    # ReplicaReadStrategy member (settings build it with
    # ``env.enum(..., type=ReplicaReadStrategy)``). Enum lookup by value
    # accepts both, so normalise once instead of comparing strings.
    try:
        strategy = ReplicaReadStrategy(settings.REPLICA_READ_STRATEGY)
    except ValueError:
        raise ImproperlyConfiguredError(
            f"Unknown REPLICA_READ_STRATEGY {settings.REPLICA_READ_STRATEGY}"
        ) from None

    # Work on a copy so the caller's list is left untouched.
    candidates = list(replicas)
    while candidates:
        if strategy is ReplicaReadStrategy.DISTRIBUTED:
            database = random.choice(candidates)
        else:
            database = candidates[0]
        candidates.remove(database)

        status = cache.get(f"db_connection_active.{database}")
        if status == "online":
            return database
        if status == "offline":
            continue

        # No cached status: probe the database (this also refreshes the cache).
        if connection_check(database):
            return database

    return None
task_processor.task_run_method import TaskRunMethod env = Env() @@ -166,6 +167,7 @@ DATABASE_ROUTERS = ["app.routers.PrimaryReplicaRouter"] NUM_DB_REPLICAS = 0 +NUM_CROSS_REGION_DB_REPLICAS = 0 # Allows collectstatic to run without a database, mainly for Docker builds to collectstatic at build time if "DATABASE_URL" in os.environ: DATABASES = { @@ -178,11 +180,37 @@ "REPLICA_DATABASE_URLS", default=[], delimiter=REPLICA_DATABASE_URLS_DELIMITER ) NUM_DB_REPLICAS = len(REPLICA_DATABASE_URLS) + + # Cross region replica databases are used as fallbacks if the + # primary replica set becomes unavailable. + CROSS_REGION_REPLICA_DATABASE_URLS_DELIMITER = env( + "CROSS_REGION_REPLICA_DATABASE_URLS_DELIMITER", "," + ) + CROSS_REGION_REPLICA_DATABASE_URLS = env.list( + "CROSS_REGION_REPLICA_DATABASE_URLS", + default=[], + delimiter=CROSS_REGION_REPLICA_DATABASE_URLS_DELIMITER, + ) + NUM_CROSS_REGION_DB_REPLICAS = len(CROSS_REGION_REPLICA_DATABASE_URLS) + + # DISTRIBUTED spreads the load out across replicas while + # SEQUENTIAL only falls back once the first replica connection is faulty + REPLICA_READ_STRATEGY = env.enum( + "REPLICA_READ_STRATEGY", + type=ReplicaReadStrategy, + default=ReplicaReadStrategy.DISTRIBUTED.value, + ) + for i, db_url in enumerate(REPLICA_DATABASE_URLS, start=1): DATABASES[f"replica_{i}"] = dj_database_url.parse( db_url, conn_max_age=DJANGO_DB_CONN_MAX_AGE ) + for i, db_url in enumerate(CROSS_REGION_REPLICA_DATABASE_URLS, start=1): + DATABASES[f"cross_region_replica_{i}"] = dj_database_url.parse( + db_url, conn_max_age=DJANGO_DB_CONN_MAX_AGE + ) + if "ANALYTICS_DATABASE_URL" in os.environ: DATABASES["analytics"] = dj_database_url.parse( env("ANALYTICS_DATABASE_URL"), conn_max_age=DJANGO_DB_CONN_MAX_AGE diff --git a/api/tests/unit/app/test_unit_app_routers.py b/api/tests/unit/app/test_unit_app_routers.py new file mode 100644 index 000000000000..e7857612efe5 --- /dev/null +++ b/api/tests/unit/app/test_unit_app_routers.py @@ -0,0 +1,196 @@ +from 
def test_replica_router_db_for_read_with_local_offline_replicas(
    db: None,
    settings: SettingsWrapper,
    mocker: MockerFixture,
    reset_cache: None,
) -> None:
    """Reads fall back to a cross region replica once every local one is down."""
    # Given
    # Four local replicas, with two cross region replicas as the fallback pool.
    settings.REPLICA_READ_STRATEGY = ReplicaReadStrategy.DISTRIBUTED.value
    settings.NUM_CROSS_REGION_DB_REPLICAS = 2
    settings.NUM_DB_REPLICAS = 4

    # The four local replicas and the first cross region replica tried all
    # report unusable; only the last cross region replica is usable.
    fake_connection = mocker.MagicMock()
    fake_connection.is_usable.side_effect = (False,) * 5 + (True,)
    create_connection_mock = mocker.patch(
        "app.routers.connections.create_connection", return_value=fake_connection
    )

    # When
    chosen_database = PrimaryReplicaRouter().db_for_read(FFAdminUser)

    # Then
    # DISTRIBUTED picks at random, so only the pool is checked: the result
    # must be a cross region replica, not a local replica or the primary.
    assert chosen_database.startswith("cross_region_replica_")

    # Every one of the six databases must have been probed exactly once.
    expected_probe_count = 6
    assert create_connection_mock.call_count == expected_probe_count
    assert fake_connection.is_usable.call_count == expected_probe_count
def test_replica_router_db_with_sequential_read(
    db: None,
    settings: SettingsWrapper,
    mocker: MockerFixture,
    reset_cache: None,
) -> None:
    """SEQUENTIAL walks the replicas in order and stops at the first usable one."""
    # Given
    settings.REPLICA_READ_STRATEGY = ReplicaReadStrategy.SEQUENTIAL.value
    settings.NUM_CROSS_REGION_DB_REPLICAS = 2
    settings.NUM_DB_REPLICAS = 100

    # The first probe (replica_1) reports unusable, the second (replica_2) usable.
    fake_connection = mocker.MagicMock()
    fake_connection.is_usable.side_effect = (False, True)
    create_connection_mock = mocker.patch(
        "app.routers.connections.create_connection", return_value=fake_connection
    )

    # When
    chosen_database = PrimaryReplicaRouter().db_for_read(FFAdminUser)

    # Then
    # The router must have moved on from the first replica to the second.
    assert chosen_database == "replica_2"

    # Only the two replicas that were actually tried may have been probed.
    expected_probe_count = 2
    assert create_connection_mock.call_count == expected_probe_count
    assert fake_connection.is_usable.call_count == expected_probe_count
+ assert result == "default" + conn_call_count = 0 + assert create_connection_patch.call_count == conn_call_count + assert conn_patch.is_usable.call_count == conn_call_count diff --git a/docs/docs/deployment/configuration/sizing-and-scaling.md b/docs/docs/deployment/configuration/sizing-and-scaling.md index 1af69ca28ffc..78e37571e5f3 100644 --- a/docs/docs/deployment/configuration/sizing-and-scaling.md +++ b/docs/docs/deployment/configuration/sizing-and-scaling.md @@ -28,8 +28,40 @@ In terms of auto scaling, we recommend basing the autoscaling off the `ECSServic Our recommendation is to first scale the database up with a more powerful single server. +### Replication + Once the database connections have been saturated by the API cluster, adding read replicas to the database solves the next bottleneck of database connections. +Flagsmith can be set up to handle as many read replicas as needed. To add replicas, you'll need to set the +`REPLICA_DATABASE_URLS` environment variable with a comma-separated list of database URLs. + +Example: + +``` +REPLICA_DATABASE_URLS: postgres://user:password@replica1.database.host:5432/flagsmith,postgres://user:password@replica2.database.host:5432/flagsmith +``` + +:::tip + +Use the `REPLICA_DATABASE_URLS_DELIMITER` environment variable if you are using any `,` characters in your passwords. + +::: + +In addition to typical read replicas, which usually exist locally in the same data centre as the application, there is +also support for replicas across regions via the `CROSS_REGION_REPLICA_DATABASE_URLS` environment variable which is set +in the same way as the `REPLICA_DATABASE_URLS` with cross region replicas having their own matching +`CROSS_REGION_REPLICA_DATABASE_URLS_DELIMITER` which also defaults to `,` as above. + +Cross region replicas are only used once all typical replicas have gone offline, since the performance characteristics +wouldn't be favorable to spread replica load at longer latencies.
Both `REPLICA_DATABASE_URLS` and +`CROSS_REGION_REPLICA_DATABASE_URLS` can be used alone or simultaneously. + +To support different configurations there are two different replication strategies available. By setting +`REPLICA_READ_STRATEGY` to `DISTRIBUTED` (the default option) the load to the replicas is distributed evenly. If your +use-case, on the other hand, is to utilize fallback replicas (primary, secondary, etc.) the `REPLICA_READ_STRATEGY` should +be set to `SEQUENTIAL` so a replica is only used if all the other replicas preceding it have gone offline. This +strategy is applicable to both typical replicas and cross region replicas. + We would also recommend testing [pgBouncer](https://www.pgbouncer.org/) in your environment as it generally optimises database connections and reduces the load on the database. diff --git a/docs/docs/deployment/hosting/locally-api.md b/docs/docs/deployment/hosting/locally-api.md index 5debcb744f00..ea8181f39147 100644 --- a/docs/docs/deployment/hosting/locally-api.md +++ b/docs/docs/deployment/hosting/locally-api.md @@ -55,23 +55,6 @@ variable called `DATABASE_URL`. This should be configured in the Heroku-ish appl When running the application using Docker, it reads the database configuration from the settings located in `app.settings.production` -### Replication - -Flagsmith can be set up to handle as many read replicas as needed. To add replicas, you'll need to set the -`REPLICA_DATABASE_URLS` environment variable with a comma separated list of database urls. - -Example: - -``` -REPLICA_DATABASE_URLS: postgres://user:password@replica1.database.host:5432/flagsmith,postgres://user:password@replica2.database.host:5432/flagsmith -``` - -:::tip - -Use the `REPLICA_DATABASE_URLS_DELIMITER` environment variable if you are using any `,` characters in your passwords. - -::: - ## Initialising The application is built using django which comes with a handy set of admin pages available at `/admin/`. To access