Emergency

Restart Node fails (ubuntu-16.04.3, percona-xtradb-cluster 5.7.19-29.22-3.xenial)

Collapse
X
  • Filter
  • Time
  • Show
Clear All
new posts

  • Restart Node fails (ubuntu-16.04.3, percona-xtradb-cluster 5.7.19-29.22-3.xenial)

    Hi everyone,

    I am currently testing a 2-node + arbitrator setup and am running into problems when restarting a single node.
    After all nodes have joined the cluster, if I restart a single one, it fails to start again and only shows

    " WSREP: Failed to recover position:"

    After cleaning out the whole datadir it will rejoin the cluster just fine.


    wsrep.conf
    Code:
    [mysqld]
    # if the cluster is shut down and restarted in reverse order, try to use IST instead of a full SST
    # https://www.percona.com/blog/2016/11/30/galera-cache-gcache-finally-recoverable-restart/
    #wsrep_provider_options="gcache.size=3G"
    
    # Disabling symbolic-links is recommended to prevent assorted security risks
    symbolic-links           = 0
    
    # Path to Galera library
    wsrep_provider           =/usr/lib/galera3/libgalera_smm.so
    
    # In order for Galera to work correctly binlog format should be ROW
    binlog_format            = ROW
    
    # MyISAM storage engine has only experimental support
    default_storage_engine   = InnoDB
    
    # Slave thread to use
    #wsrep_slave_threads      = 16
    
    wsrep_log_conflicts
    
    # This changes how InnoDB autoincrement locks are managed and is a requirement for Galera
    innodb_autoinc_lock_mode =2
    
    # Node IP address
    wsrep_node_address       =  10.10.0.131
    
    # Cluster name
    wsrep_cluster_name       = percona_cluster_fra
    
    #If wsrep_node_name is not specified,  then system hostname will be used
    wsrep_node_name          = db03
    
    #pxc_strict_mode allowed values: DISABLED,PERMISSIVE,ENFORCING,MASTER
    # TODO get devs to fix stuff and switch to ENFORCING
    pxc_strict_mode          = ENFORCING
    
    # SST method
    wsrep_sst_method         = xtrabackup-v2
    
    #Authentication for SST method
    wsrep_sst_auth           = "xtrabackup:password"
    
    # Cluster connection URL contains IPs of nodes
    #If no IP is found, this implies that a new cluster needs to be created,
    #in order to do that you need to bootstrap this node
    wsrep_cluster_address   = gcomm://10.10.0.131,10.10.0.132,10.10.0.101
    
    
    #wsrep_notify_cmd        = /usr/local/bin/galeranotify.py
    my.cnf
    Code:
    # Ansible managed
    #
    # change mysql-prompt
    [mysql]
    prompt                            =\u@db03:[\d]>\_
    
    # Template my.cnf for PXC
    # Edit to your requirements.
    [mysqld]
    user                              = mysql
    server-id                         = 3
    datadir                           = /data/mysql
    tmpdir                            = /tmp
    socket                            = /var/run/mysqld/mysqld.sock
    log-error                         = /var/log/mysqld.log
    pid-file                          = /var/run/mysqld/mysqld.pid
    
    skip-name-resolve
    
    # deactivated because of keepalived
    # bind_address                    = 10.10.0.131
    
    enforce_gtid_consistency          = 1
    gtid_mode                         = on
    
    # set buffer to 70%
    innodb_buffer_pool_size           = 90069M
    innodb_file_per_table             = ON
    innodb_flush_log_at_trx_commit    = 2
    
    # Logging
    log-bin                           = mysql-bin
    max_binlog_size                   = 300000000
    log_slave_updates
    slow-query-log                    = true
    slow_query_log_file          = /var/log/mysql/mysql-slow.log
    
    
    long_query_time                   = 1
    log_error_verbosity               = 2
    expire_logs_days                  = 4
    
    log_output                        = file
    slow_query_log                    = ON
    long_query_time                   = 1
    log_slow_rate_limit               = 100
    #log_slow_rate_type               = query
    log_slow_verbosity                = full
    log_slow_admin_statements         = ON
    log_slow_slave_statements         = ON
    slow_query_log_always_write_time  = 1
    slow_query_log_use_global_control = all
    innodb_monitor_enable             = all
    userstat                          = 1
    
    explicit_defaults_for_timestamp   = 1
    
    # # --------------------------------------------------------------------------------
    event_scheduler                   = 1
    max_connect_errors                = 16385  # block server after this many unsuccessful connections
    
    # # slave-replication
    # #slave_net_timeout              = 60
    # #binlog_cache_size              = 2M
    # #binlog_stmt_cache_size         = 2M
    
    # # Threading / Processes
    # #
    thread_cache_size                 = 1024
    max_connections                   = 8192
    back_log                          = 512   # default 50 (max. = net.ipv4.tcp_max_syn_backlog = 2048)
    
    # #
    # # ThreadPool
    # #
    thread_handling                   = pool-of-threads
    thread_pool_size                  = 26  # default # of CPUs
    thread_pool_stall_limit           = 500                   # default 500 (ms)
    # thread_pool_max_threads         = 500                   # default 500
    # thread_pool_idle_timeout        = 60                    # default 60 (s)
    
    # #
    # # Query cache
    # #
    query_cache_limit                 = 16M
    query_cache_size                  = 1M
    
    # #
    # # All storage engines
    # #
    tmp_table_size                    = 8192M
    max_heap_table_size               = 8192M
    table_open_cache           = 4000
    Does anyone know what the error might be?

    regards,
    Roman

  • #2
    OK, I found the error myself. It seems to be a bug related to log_error_verbosity: after the initial join, a restart fails if log_error_verbosity is set to 2 or lower. Setting it to 3 makes the restart work.

    Comment

    Working...
    X