PostgreSQL
HA PostgreSQL stores the oVirt Engine configuration. It is configured with the following features:
- Fail-over of the Master
- If the Master breaks down, the resource agent (RA) detects the fault, stops the failed Master, and promotes the Slave to become the new Master.
- Switching between asynchronous and synchronous replication
- If the Slave breaks down or the LAN has problems, write transactions would block for as long as synchronous replication is in effect, which effectively stops the service. The RA therefore switches dynamically from synchronous to asynchronous replication to keep the service available.
- Automatic detection of the newest data at initial start
- When Pacemaker is started on both nodes at the same time, the RA compares the nodes' data using the "last xlog replay location" or "last receive replay location" to determine which node has the newest data. That node becomes the Master.
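The replay and receive locations mentioned above can also be inspected by hand; a minimal sketch, assuming the PostgreSQL 9.x function names this setup is written for (they were renamed in PostgreSQL 10):
su - postgres
#on a standby: last WAL position replayed during recovery
psql -At -c "SELECT pg_last_xlog_replay_location();"
#on a standby: last WAL position received over streaming replication
psql -At -c "SELECT pg_last_xlog_receive_location();"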
Node 1
su - postgres
mkdir /var/lib/pgsql/pg_archive
cd /var/lib/pgsql/data
initdb
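A bare initdb works here because the postgres user's environment on CentOS/RHEL packages sets PGDATA=/var/lib/pgsql/data; the equivalent explicit call would be (a sketch):
initdb -D /var/lib/pgsql/data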
/var/lib/pgsql/data/postgresql.conf
listen_addresses = '*'
max_connections = 200
shared_buffers = 32MB
wal_level = hot_standby
synchronous_commit = on
archive_mode = on
archive_command = 'cp %p /var/lib/pgsql/pg_archive/%f'
max_wal_senders = 5
wal_keep_segments = 32
replication_timeout = 9000
hot_standby = on
max_standby_archive_delay = -1
max_standby_streaming_delay = -1
wal_receiver_status_interval = 2
hot_standby_feedback = on
logging_collector = on
log_filename = 'postgresql-%a.log'
log_truncate_on_rotation = on
log_rotation_age = 1d
log_rotation_size = 0
log_timezone = 'Europe/Berlin'
datestyle = 'iso, mdy'
timezone = 'Europe/Berlin'
lc_messages = 'en_US.UTF-8'
lc_monetary = 'en_US.UTF-8'
lc_numeric = 'en_US.UTF-8'
lc_time = 'en_US.UTF-8'
default_text_search_config = 'pg_catalog.english'
restart_after_crash = off
include '/var/lib/pgsql/tmp/rep_mode.conf'
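The include directive at the end points to a file that the pgsql resource agent maintains once it manages the database. PostgreSQL refuses to start if an include target is missing, so for a manual start an empty placeholder is needed (a sketch; the RA overwrites it later):
su - postgres -c "mkdir -p /var/lib/pgsql/tmp && touch /var/lib/pgsql/tmp/rep_mode.conf"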
Add permissions for PostgreSQL replication and database access over the network by editing
/var/lib/pgsql/pg_hba.conf
local all all trust
host all all 127.0.0.1/32 trust
host all all ::1/128 trust
host all all OVIRTM/32 md5
host all all CLUSTERLAN_CLUSTER_N1_IP/32 trust
host all all CLUSTERLAN_CLUSTER_N2_IP/32 trust
host all all CLUSTERLAN_CLUSTER_IP/32 trust
host replication all CLUSTERLAN_CLUSTER_N1_IP/32 trust
host replication all CLUSTERLAN_CLUSTER_N2_IP/32 trust
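With postgresql.conf and pg_hba.conf in place, Node 1 can be started by hand so that Node 2 is able to take its base backup in the next step (a sketch; Pacemaker takes over the startup later):
su - postgres
pg_ctl -D /var/lib/pgsql/data start
#should return f (false) on the Master, i.e. not in recovery
psql -At -c "SELECT pg_is_in_recovery();"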
Node 2
su - postgres
rm -rf /var/lib/pgsql/data/*
pg_basebackup -h CLUSTERLAN_CLUSTER_N1_IP -U postgres -D /var/lib/pgsql/data -X stream -P
mkdir /var/lib/pgsql/pg_archive
/var/lib/pgsql/data/recovery.conf
standby_mode = 'on'
primary_conninfo = 'host=CLUSTERLAN_CLUSTER_N1_IP port=5432 user=postgres application_name=NODE2'
restore_command = 'cp /var/lib/pgsql/pg_archive/%f %p'
recovery_target_timeline = 'latest'
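The standby can now be started by hand to confirm that streaming replication is established (a sketch; sync_state only switches to sync once the resource agent writes rep_mode.conf):
su - postgres
#the include target from postgresql.conf lives outside the data directory and is not copied by pg_basebackup
mkdir -p /var/lib/pgsql/tmp && touch /var/lib/pgsql/tmp/rep_mode.conf
pg_ctl -D /var/lib/pgsql/data start
#check on Node 1 (here queried remotely): one row per connected standby, state should be "streaming"
psql -h CLUSTERLAN_CLUSTER_N1_IP -U postgres -x -c "SELECT client_addr, state, sync_state FROM pg_stat_replication;"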
Prepare for Pacemaker
wget https://raw.github.com/ClusterLabs/resource-\
agents/a6f4ddf76cb4bbc1b3df4c9b6632a6351b63c19e/heartbeat/pgsql
cp pgsql /usr/lib/ocf/resource.d/heartbeat/
chmod 755 /usr/lib/ocf/resource.d/heartbeat/pgsql
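Before the resource is added, any manually started PostgreSQL instances should be stopped again, since the resource agent expects to start the server itself; the freshly installed agent can also be checked at this point (a sketch):
#stop manually started instances on both nodes
su - postgres -c "pg_ctl -D /var/lib/pgsql/data stop"
#verify that Pacemaker/pcs can see the pgsql agent and its parameters
pcs resource describe ocf:heartbeat:pgsql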
For further information see the clusterlabs wiki: http://clusterlabs.org/wiki/PgSQL_Replicated_Cluster
Adding as cluster resource
#create resource
pcs resource create pgsql pgsql \
pgctl="/usr/bin/pg_ctl" \
psql="/usr/bin/psql" \
pgdata="/var/lib/pgsql/data/" \
rep_mode="sync" \
node_list="CLUSTER_N1.clan CLUSTER_N2.clan" \
restore_command="cp /var/lib/pgsql/pg_archive/%f %p" \
primary_conninfo_opt="keepalives_idle=60 keepalives_interval=5 \
keepalives_count=5" \
master_ip="CLUSTERLAN_CLUSTER_IP" \
restart_on_promote='true' \
op start timeout="60s" interval="0s" on-fail="restart" \
op monitor timeout="60s" interval="4s" on-fail="restart" \
op monitor timeout="60s" interval="3s" on-fail="restart" role="Master" \
op promote timeout="60s" interval="0s" on-fail="restart" \
op demote timeout="60s" interval="0s" on-fail="stop" \
op stop timeout="60s" interval="0s" on-fail="block" \
op notify timeout="60s" interval="0s"
#create master slave set
pcs resource master msPostgresql pgsql master-max=1 master-node-max=1 \
clone-max=2 clone-node-max=1 notify=true
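pcs status should afterwards show msPostgresql with one Master and one Slave. The master_ip used above is normally a floating IP managed by the cluster itself; a hedged sketch of such a resource and its constraints (the resource name vip-master and the netmask are assumptions, not part of this page's fixed configuration):
#floating IP that follows the PostgreSQL Master (IPaddr2 resource agent)
pcs resource create vip-master IPaddr2 ip="CLUSTERLAN_CLUSTER_IP" cidr_netmask="24" \
    op monitor interval="10s"
#keep the IP on whichever node holds the Master role
pcs constraint colocation add vip-master with Master msPostgresql INFINITY
#bring the IP up only after a successful promote
pcs constraint order promote msPostgresql then start vip-master symmetrical=false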
Recovery on failure
If the resource gets stopped, connect to any of the nodes and try to run the following:
pcs resource disable msPostgresql
#login as postgres
su - postgres
#kill all running instances
killall -sSIGKILL postgres
#try to start it manually
pg_ctl start
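If pg_ctl reports an error, today's server log shows the reason; a sketch of tailing it, assuming the default log_directory (pg_log) and the log_filename pattern set in postgresql.conf:
tail -n 50 /var/lib/pgsql/data/pg_log/postgresql-$(date +%a).log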
Read the logs available under /var/lib/pgsql/data/pg_log/ and check whether the startup was successful.
If it was not, connect to the other host and try the same thing there.
Once PostgreSQL is running on one node, connect to the other host and run the following recovery script.
recover.sh
su - postgres << 'EOF'
rm -rf /var/lib/pgsql/data/
pg_basebackup -h OTHER_NODE_IP -U postgres -D /var/lib/pgsql/data -X stream -P
rm /var/lib/pgsql/tmp/PGSQL.lock
EOF
pcs resource cleanup msPostgresql
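Since the resource was disabled at the beginning of this procedure, it usually has to be enabled again once the cleanup has succeeded (a short sketch):
pcs resource enable msPostgresql
pcs status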
IF EVERYTHING FAILS, RECOVER FROM BACKUP ;-)
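That last resort presupposes that a backup actually exists; a minimal sketch of taking one regularly via the cluster IP (pg_dumpall and the target directory /var/lib/pgsql/backups are assumptions, not part of this setup):
#create the target directory once: mkdir -p /var/lib/pgsql/backups && chown postgres /var/lib/pgsql/backups
su - postgres -c "pg_dumpall -h CLUSTERLAN_CLUSTER_IP > /var/lib/pgsql/backups/dumpall-$(date +%F).sql"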