I'm trying to get this coordinator and worker setup running without errors, using a pre-made image of PostgreSQL and Citus. Both are the latest certified versions.
(attendance-system-py3.10) ┌<▸> ~/g/a/postgreSQL
└➤ docker ps | grep attendance
fc88b27ae5e7 test-postgres14-full-citus12:latest "docker-entrypoint.s…" 45 minutes ago Up 44 minutes (unhealthy) 0.0.0.0:5433->5432/tcp attendance-coordinator
c7299b6ee214 test-postgres14-full-citus12:latest "docker-entrypoint.s…" 45 minutes ago Up 44 minutes (unhealthy) 0.0.0.0:51379->5432/tcp attendance-worker
6e6e955ed553 hashicorp/consul:1.20 "docker-entrypoint.s…" 45 minutes ago Up 45 minutes (unhealthy) 8300-8302/tcp, 8301-8302/udp, 8600/tcp, 8600/udp, 0.0.0.0:8500->8500/tcp attendance-consul
(attendance-system-py3.10) ┌<▸> ~/g/a/postgreSQL
└➤
On the coordinator I see this:
2025-01-24 09:37:10 psql:/docker-entrypoint-initdb.d/13_citus_verification.sql:8: ERROR: connection to the remote node test_user@attendance-worker:5432 failed with the following error: Connection refused
2025-01-24 09:37:10 Is the server running on that host and accepting TCP/IP connections?
On the worker I see this:
2025-01-24 09:37:10 psql:/docker-entrypoint-initdb.d/13_citus_verification.sql:8: ERROR: connection to the remote node test_user@attendance-worker:5432 failed with the following error: Connection refused
2025-01-24 09:37:10 Is the server running on that host and accepting TCP/IP connections?
2025-01-24 09:37:10 CONTEXT: SQL statement "SELECT citus_add_node('attendance-worker', 5432)"
2025-01-24 09:37:10 PL/pgSQL function inline_code_block line 5 at PERFORM
2025-01-24 09:36:07 schedule
2025-01-24 09:36:07 ----------
2025-01-24 09:36:07 3
2025-01-24 09:36:07 (1 row)
In postgresql.conf it doesn't matter whether I set ssl to on or off; it still says it can't connect.
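(For context: "Connection refused" is a plain TCP-level failure that happens before any TLS negotiation starts, so the ssl setting cannot influence it; it means nothing is listening on that host and port yet. A quick way to confirm this from outside — a diagnostic sketch, using the container names from the compose file below:

docker exec attendance-coordinator pg_isready -h attendance-worker -p 5432
# prints "attendance-worker:5432 - no response" while the worker is not yet accepting connections)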
Dockerfile
FROM citusdata/citus:postgres_14
RUN apt-get update && \
apt-get install -y gnupg && \
apt-get install -y postgresql-contrib postgresql-14-cron openssl && \
rm -rf /var/lib/apt/lists/*
# Create a directory for configuration files and certificates
RUN mkdir -p /etc/postgresql/ssl && \
chown -R postgres:postgres /etc/postgresql/ssl
# Generate self-signed SSL certificates
RUN cd /etc/postgresql/ssl && \
openssl req -new -x509 \
-days 365 \
-nodes \
-text \
-out server.crt \
-keyout server.key \
-subj "/CN=postgres" \
-addext "subjectAltName = DNS:localhost,IP:127.0.0.1,DNS:*.local" && \
chmod 600 server.key && \
chown postgres:postgres server.key server.crt
# Copy configuration files to a temporary location
COPY pg_hba.conf /etc/postgresql/
COPY postgresql.conf /etc/postgresql/
# Create a script that copies the configuration files
RUN echo '#!/bin/bash\n\
cp /etc/postgresql/pg_hba.conf $PGDATA/\n\
cp /etc/postgresql/postgresql.conf $PGDATA/\n\
chmod 600 $PGDATA/pg_hba.conf\n\
chmod 600 $PGDATA/postgresql.conf\n\
chown postgres:postgres $PGDATA/pg_hba.conf\n\
chown postgres:postgres $PGDATA/postgresql.conf' > /docker-entrypoint-initdb.d/00_config_files.sh && \
chmod +x /docker-entrypoint-initdb.d/00_config_files.sh
# Copy SQL scripts in a specific order
COPY 00_init_db_encryption_capabilites.sql /docker-entrypoint-initdb.d/00_init_db_encryption_capabilites.sql
COPY 01_extensions_and_types.sql /docker-entrypoint-initdb.d/01_extensions_and_types.sql
COPY 02_base_tables.sql /docker-entrypoint-initdb.d/02_base_tables.sql
COPY 03_partitioning.sql /docker-entrypoint-initdb.d/03_partitioning.sql
COPY 04_procedures.sql /docker-entrypoint-initdb.d/04_procedures.sql
COPY 05_query_functions.sql /docker-entrypoint-initdb.d/05_query_functions.sql
COPY 06_maintenance_procedures.sql /docker-entrypoint-initdb.d/06_maintenance_procedures.sql
COPY 07_security_procedures.sql /docker-entrypoint-initdb.d/07_security_procedures.sql
COPY 08_views_and_reports.sql /docker-entrypoint-initdb.d/08_views_and_reports.sql
COPY 09_cleanup_and_optimization.sql /docker-entrypoint-initdb.d/09_cleanup_and_optimization.sql
COPY 11_citus_distribution.sql /docker-entrypoint-initdb.d/11_citus_distribution.sql
COPY 12_init_test_data.sql /docker-entrypoint-initdb.d/12_init_test_data.sql
COPY 13_citus_verification.sql /docker-entrypoint-initdb.d/13_citus_verification.sql
COPY 14_citus_network.sql /docker-entrypoint-initdb.d/14_citus_network.sql
EXPOSE 5432
# Drop the explicit config_file reference since it now lives in the default location
CMD ["postgres"]
13_citus_verification.sql
-- 13_citus_verification.sql
DO $$
BEGIN
PERFORM pg_sleep(60); -- Adjust this delay as needed
PERFORM citus_set_coordinator_host('attendance-coordinator');
PERFORM citus_add_node('attendance-worker', 5432);
END $$;
-- Helper function for logging
CREATE OR REPLACE FUNCTION log_test_result(test_name text, result text) RETURNS void AS $$
BEGIN
RAISE NOTICE 'Test %: %', test_name, result;
END;
$$ LANGUAGE plpgsql;
-- 1. Insert tests
DO $$
DECLARE
school_id uuid;
student_id uuid;
tutor_id uuid;
BEGIN
-- Insert a new school
INSERT INTO schools (name, state, country)
VALUES ('Escuela Test Citus', 'Madrid', 'ES')
RETURNING id INTO school_id;
-- Insert a student
INSERT INTO students (name, school_id)
VALUES ('Estudiante Test Citus', school_id)
RETURNING id INTO student_id;
-- Insert a tutor
INSERT INTO tutors (name, email, school_id)
VALUES ('Tutor Test Citus', '[email protected]', school_id)
RETURNING id INTO tutor_id;
-- Insert messages into different partitions
INSERT INTO messages (claude_conversation_id, student_id, school_id, tutor_id, sender_type, content, created_at)
VALUES
('test-conv-1', student_id, school_id, tutor_id, 'TUTOR', 'Mensaje test 1', '2024-12-15T10:00:00Z'),
('test-conv-2', student_id, school_id, tutor_id, 'SCHOOL', 'Mensaje test 2', '2025-01-15T10:00:00Z'),
('test-conv-3', student_id, school_id, tutor_id, 'CLAUDE', 'Mensaje test 3', '2025-02-15T10:00:00Z');
PERFORM log_test_result('Data insertion', 'OK');
EXCEPTION WHEN OTHERS THEN
PERFORM log_test_result('Data insertion', 'ERROR: ' || SQLERRM);
RAISE;
END $$;
-- 2. SELECT tests
DO $$
BEGIN
-- Test 1: Simple query by school
EXPLAIN ANALYZE
SELECT s.name as school_name, COUNT(m.*) as message_count
FROM schools s
LEFT JOIN messages m ON m.school_id = s.id
GROUP BY s.id, s.name;
PERFORM log_test_result('Query grouped by school', 'OK');
-- Test 2: Complex join across all tables
EXPLAIN ANALYZE
SELECT
s.name as school_name,
st.name as student_name,
t.name as tutor_name,
m.content,
m.created_at
FROM messages m
JOIN schools s ON s.id = m.school_id
JOIN students st ON st.id = m.student_id
JOIN tutors t ON t.id = m.tutor_id
WHERE m.created_at >= '2024-12-01'
ORDER BY m.created_at DESC
LIMIT 10;
PERFORM log_test_result('Complex join', 'OK');
EXCEPTION WHEN OTHERS THEN
PERFORM log_test_result('SELECT queries', 'ERROR: ' || SQLERRM);
RAISE;
END $$;
-- 3. UPDATE tests
DO $$
DECLARE
test_school_id uuid;
BEGIN
-- Fetch a test school
SELECT id INTO test_school_id FROM schools WHERE name = 'Escuela Test Citus';
-- Update a reference table
UPDATE schools
SET phone = '123456789'
WHERE id = test_school_id;
-- Update a distributed table
UPDATE students
SET updated_at = CURRENT_TIMESTAMP
WHERE school_id = test_school_id;
-- Update a partitioned table
UPDATE messages
SET content = content || ' (actualizado)'
WHERE school_id = test_school_id;
PERFORM log_test_result('UPDATE operations', 'OK');
EXCEPTION WHEN OTHERS THEN
PERFORM log_test_result('UPDATE operations', 'ERROR: ' || SQLERRM);
RAISE;
END $$;
-- 4. DELETE tests
DO $$
DECLARE
test_school_id uuid;
BEGIN
-- Fetch the test school
SELECT id INTO test_school_id FROM schools WHERE name = 'Escuela Test Citus';
-- Delete messages
DELETE FROM messages WHERE school_id = test_school_id;
-- Delete students
DELETE FROM students WHERE school_id = test_school_id;
-- Delete tutors
DELETE FROM tutors WHERE school_id = test_school_id;
-- Delete the school
DELETE FROM schools WHERE id = test_school_id;
PERFORM log_test_result('DELETE operations', 'OK');
EXCEPTION WHEN OTHERS THEN
PERFORM log_test_result('DELETE operations', 'ERROR: ' || SQLERRM);
RAISE;
END $$;
-- 5. Verify data distribution
SELECT
tableoid::regclass as table_name,
COUNT(*) as row_count
FROM messages
GROUP BY tableoid
ORDER BY table_name;
-- 6. Verify join performance
EXPLAIN ANALYZE
SELECT
date_trunc('month', m.created_at) as month,
s.name as school_name,
COUNT(DISTINCT st.id) as num_students,
COUNT(DISTINCT t.id) as num_tutors,
COUNT(*) as num_messages
FROM messages m
JOIN schools s ON s.id = m.school_id
JOIN students st ON st.id = m.student_id
JOIN tutors t ON t.id = m.tutor_id
GROUP BY date_trunc('month', m.created_at), s.name
ORDER BY month;
-- 7. Validate worker status
SELECT * FROM citus_get_active_worker_nodes();
-- 8. Verify shard distribution
SELECT
logicalrelid::regclass as table_name,
COUNT(DISTINCT shardid) as num_shards
FROM pg_dist_shard
GROUP BY logicalrelid
ORDER BY table_name;
14_citus_network.sql
-- 14_citus_network.sql
DO $$
BEGIN
-- Wait for the services to be ready
PERFORM pg_sleep(10);
-- Basic coordinator-worker configuration
IF EXISTS (
SELECT 1 FROM pg_roles
WHERE rolname = current_user
AND rolsuper
) THEN
-- Only try to add the worker if we are on the coordinator
BEGIN
PERFORM citus_add_node('attendance-worker', 5432);
RAISE NOTICE 'Worker added successfully';
EXCEPTION WHEN OTHERS THEN
RAISE NOTICE 'Error adding worker: %', SQLERRM;
END;
END IF;
-- Verify the configuration
PERFORM pg_sleep(2);
RAISE NOTICE 'Active nodes: %', (
SELECT string_agg(node_name || ':' || nodeport, ', ')
FROM citus_get_active_worker_nodes()
);
END $$;
pg_hba.conf
# Database administrative login by Unix domain socket
local all postgres trust
local all test_user trust
# IPv4 local connections
host all all 127.0.0.1/32 scram-sha-256
host all all 0.0.0.0/0 scram-sha-256
host all all 10.0.0.0/8 trust # Changed for Citus
host all all 172.16.0.0/12 trust # Changed for Citus
host all all 192.168.0.0/16 trust # Changed for Citus
host all all 10.0.0.0/24 trust # Changed for Citus
# IPv6 local connections
host all all ::1/128 scram-sha-256
# SSL connections
hostssl all all 127.0.0.1/32 scram-sha-256
hostssl all all 0.0.0.0/0 scram-sha-256
hostssl all all 10.0.0.0/8 trust
hostssl all test_user 127.0.0.1/32 trust # For the tests
postgresql.conf
# Required extensions and features
shared_preload_libraries='pg_cron, citus' # For scheduled tasks
cron.database_name='test_db' # Database for pg_cron
# Network configuration
listen_addresses='*'
port=5432
max_connections=100
# Memory configuration
shared_buffers=128MB
work_mem=4MB
maintenance_work_mem=64MB
# Security and encryption configuration
password_encryption = scram-sha-256 # Password encryption method
ssl = on # Enable SSL. I have also tried setting this to off; it makes no difference, and the error appears on both the coordinator and the worker.
ssl_cert_file = '/etc/postgresql/ssl/server.crt'
ssl_key_file = '/etc/postgresql/ssl/server.key'
ssl_prefer_server_ciphers = on # Prefer server ciphers
ssl_min_protocol_version = 'TLSv1.2' # Minimum TLS version
ssl_ca_file = ''
# Logging configuration
log_destination = 'stderr'
logging_collector = on
log_directory = 'pg_log'
log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log'
log_rotation_age = 1d
log_rotation_size = 10MB
log_min_messages = DEBUG1
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
log_checkpoints = on
log_connections = on
log_disconnections = on
log_lock_waits = on
log_temp_files = 0
# Autovacuum configuration
autovacuum = on
autovacuum_max_workers = 3
autovacuum_naptime = 1min
autovacuum_vacuum_threshold = 50
autovacuum_analyze_threshold = 50
autovacuum_vacuum_scale_factor = 0.2
autovacuum_analyze_scale_factor = 0.1
autovacuum_freeze_max_age = 200000000
autovacuum_multixact_freeze_max_age = 400000000
autovacuum_vacuum_cost_delay = 20ms
autovacuum_vacuum_cost_limit = -1
# WAL and checkpoints
# Replication and synchronization configuration
wal_level = logical # WAL level for logical replication (required by Citus)
max_replication_slots = 10 # Maximum number of replication slots
max_wal_senders = 10 # Maximum number of WAL sender processes
max_wal_size = 1GB
min_wal_size = 80MB
checkpoint_timeout = '5min'
# Statistics configuration
track_activities = on
track_counts = on
track_io_timing = on
track_functions = all # Useful for monitoring encryption functions
# Text search configuration
default_text_search_config = 'pg_catalog.spanish' # Configuration for Spanish text search
# Performance configuration for encryption operations
max_stack_depth = 7MB # Increased for complex encryption functions
huge_pages = try # Use huge pages to improve performance
docker-compose.yml
services:
consul:
image: hashicorp/consul:1.20
container_name: attendance-consul
env_file:
- .env
environment:
- CONSUL_LOCAL_CONFIG={"verify_incoming":false}
ports:
- "8500:8500"
networks:
- default
volumes:
- consul-data:/consul/data
command: agent -server -bootstrap -ui -bind=10.0.0.2 -client=0.0.0.0 -advertise=10.0.0.2
healthcheck:
test: [ "CMD", "curl", "-f", "http://localhost:8500/v1/status/leader || exit 1" ]
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
deploy:
resources:
limits:
memory: '256M'
cpus: '0.3'
reservations:
memory: '128M'
cpus: '0.1'
coordinator:
image: test-postgres14-full-citus12:latest
container_name: attendance-coordinator
environment:
POSTGRES_USER: test_user
POSTGRES_PASSWORD: test_password
POSTGRES_DB: test_db
POSTGRES_SHARED_BUFFERS: 256MB
POSTGRES_EFFECTIVE_CACHE_SIZE: 768MB
POSTGRES_WORK_MEM: 16MB
POSTGRES_MAINTENANCE_WORK_MEM: 64MB
POSTGRES_MAX_CONNECTIONS: 100
POSTGRES_MAX_WORKER_PROCESSES: 8
POSTGRES_MAX_PARALLEL_WORKERS: 8
POSTGRES_MAX_PARALLEL_WORKERS_PER_GATHER: 4
networks:
- default
ports:
- "5433:5432"
volumes:
- coordinator_data:/var/lib/postgresql/data
depends_on:
#worker:
# condition: service_healthy
- consul
# condition: service_healthy
deploy:
replicas: 1
resources:
limits:
cpus: '1.5'
memory: '1G'
reservations:
cpus: '0.5'
memory: '512M'
placement:
constraints:
- node.role == manager
update_config:
parallelism: 1
delay: 120s
order: start-first
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
window: 120s
command: >
postgres
-c config_file=/etc/postgresql/postgresql.conf
-c maintenance_work_mem=128MB
-c synchronous_commit=off
-c checkpoint_timeout=30min
-c max_wal_size=2GB
healthcheck:
test: >
pg_isready -U test_user -d test_db &&
psql -U test_user -d test_db -c "SELECT 1 FROM citus_dist_local_group" &&
psql -U test_user -d test_db -c "SELECT * FROM citus_health_check()"
interval: 10s
timeout: 5s
retries: 5
start_period: 60s
worker:
image: test-postgres14-full-citus12:latest
container_name: attendance-worker
environment:
POSTGRES_USER: test_user
POSTGRES_PASSWORD: test_password
POSTGRES_DB: test_db
POSTGRES_SHARED_BUFFERS: 256MB
POSTGRES_EFFECTIVE_CACHE_SIZE: 768MB
POSTGRES_WORK_MEM: 16MB
POSTGRES_MAINTENANCE_WORK_MEM: 64MB
POSTGRES_MAX_CONNECTIONS: 50
POSTGRES_MAX_WORKER_PROCESSES: 4
POSTGRES_MAX_PARALLEL_WORKERS: 4
POSTGRES_MAX_PARALLEL_WORKERS_PER_GATHER: 2
POSTGRES_HOST_AUTH_METHOD: trust
POSTGRES_LISTEN_ADDRESSES: '*'
networks:
- default
ports:
- "5432"
volumes:
- worker_data:/var/lib/postgresql/data
depends_on:
- consul
deploy:
mode: replicated
replicas: 1
resources:
limits:
cpus: '1.0'
memory: '1G'
reservations:
cpus: '0.25'
memory: '512M'
update_config:
parallelism: 1
delay: 60s
order: start-first
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
window: 120s
placement:
constraints:
- node.role != manager
command: >
postgres
-c config_file=/etc/postgresql/postgresql.conf
-c listen_addresses='*'
-c port=5432
-c maintenance_work_mem=64MB
-c synchronous_commit=off
-c checkpoint_timeout=30min
-c max_wal_size=1GB
healthcheck:
test: >
pg_isready -U test_user -d test_db &&
psql -U test_user -d test_db -c "SELECT 1 FROM pg_extension WHERE extname = 'citus'" &&
psql -U test_user -d test_db -c "SELECT pg_is_ready_to_accept_connections()"
interval: 30s
timeout: 10s
retries: 5
start_period: 60s
nginx:
image: nginx:latest
container_name: attendance-nginx
ports:
- "80:80"
- "443:443"
volumes:
- ./nginx/conf/nginx.conf:/etc/nginx/nginx.conf:ro
- ./nginx/conf/conf.d:/etc/nginx/conf.d:ro
- ./nginx/html:/usr/share/nginx/html:ro
- ./nginx/static:/usr/share/nginx/static:ro
- ./nginx/ssl:/etc/nginx/ssl:ro
networks:
- default
environment:
- NGINX_ENTRYPOINT_QUIET_LOGS=1
depends_on:
- front
- back
- consul
healthcheck:
test: [ "CMD", "nginx", "-t" ]
interval: 30s
timeout: 10s
retries: 3
start_period: 30s
deploy:
replicas: 1
resources:
limits:
memory: '256M'
cpus: '0.3'
reservations:
memory: '128M'
cpus: '0.1'
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
volumes:
vault-data:
driver: local
prometheus-data:
driver: local
grafana-data:
driver: local
consul-data:
driver: local
coordinator_data:
driver: local
worker_data:
driver: local
networks:
default:
driver: overlay
ipam:
config:
- subnet: 10.0.0.0/24
gateway: 10.0.0.1
attachable: true
The problem is that you are calling the citus_add_node() UDF way too early, without making sure the new node is available and ready to accept connections.
I suggest you move the Citus-related UDF calls to after the relevant worker nodes are healthy. We (at the Citus team) had a docker-compose.yml with an extra service called manager that runs all the necessary citus_add_node() queries only after the nodes are ready to accept connections.
If you are curious about the source of that manager image, see the membership-manager repo.
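For illustration, here is a minimal sketch of that pattern as a shell script. The service names, credentials, and database name are taken from the compose file above; the script itself is a hypothetical stand-in, not the actual membership-manager code. It polls both nodes with pg_isready and registers the worker only once everything accepts connections:

#!/bin/bash
# register-worker.sh -- hypothetical manager-style one-shot script
set -e

COORDINATOR=attendance-coordinator
WORKER=attendance-worker

# Block until each node actually accepts TCP connections
for host in "$WORKER" "$COORDINATOR"; do
  until pg_isready -h "$host" -p 5432 -U test_user -d test_db; do
    echo "$host not ready yet, retrying in 2s..."
    sleep 2
  done
done

# Only now is it safe to register the cluster topology on the coordinator
psql -h "$COORDINATOR" -U test_user -d test_db \
  -c "SELECT citus_set_coordinator_host('$COORDINATOR', 5432);"
psql -h "$COORDINATOR" -U test_user -d test_db \
  -c "SELECT citus_add_node('$WORKER', 5432);"

Run it as an extra one-shot service on the same network (or by hand with docker exec) once both database containers are up; with that in place, the pg_sleep(60) workaround and the citus_add_node() calls can come out of the init scripts entirely.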