CUBRID Engine  latest
master_heartbeat.c
Go to the documentation of this file.
1 /*
2  * Copyright 2008 Search Solution Corporation
3  * Copyright 2016 CUBRID Corporation
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  */
18 
19 /*
20  * master_heartbeat.c - heartbeat module in cub_master
21  */
22 
23 #ident "$Id$"
24 
25 
26 #include "config.h"
27 
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <sys/time.h>
32 #include <netdb.h>
33 #include <sys/types.h>
34 #include <sys/socket.h>
35 #include <netinet/in.h>
36 #include <arpa/inet.h>
37 #include <signal.h>
38 #include <errno.h>
39 #include <sys/wait.h>
40 #include <assert.h>
41 
42 #if !defined(WINDOWS)
43 #include <unistd.h>
44 #include <fcntl.h>
45 #include <pthread.h>
46 #include <syslog.h>
47 #endif
48 
49 #include "connection_cl.h"
50 #include "dbi.h"
51 #include "environment_variable.h"
52 #include "error_context.hpp"
53 #include "heartbeat.h"
54 #include "master_util.h"
55 #include "master_heartbeat.h"
56 #include "master_request.h"
57 #include "message_catalog.h"
58 #include "object_representation.h"
59 #include "porting.h"
60 #include "tcp.h"
61 #include "utility.h"
62 
63 #define HB_INFO_STR_MAX 8192
64 #define SERVER_DEREG_MAX_POLL_COUNT 10
65 
/*
 * ENTER_FUNC() / EXIT_FUNC() - log, through MASTER_ER_LOG_DEBUG, the name of
 *   the enclosing function when it is entered / left.
 *
 * NOTE: the expansion intentionally does NOT end with a semicolon. The
 *       caller writes `ENTER_FUNC ();' and that semicolon terminates the
 *       do/while (0), so the macro behaves as exactly one statement and
 *       can be used safely as the body of an unbraced if/else.
 */
#define ENTER_FUNC() \
do { \
  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "%s : enter", __func__); \
} while (0)

#define EXIT_FUNC() \
do { \
  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "%s : exit", __func__); \
} while (0)
75 
78 {
82 };
83 
84 /* list */
85 static void hb_list_add (HB_LIST ** p, HB_LIST * n);
86 static void hb_list_remove (HB_LIST * n);
87 static void hb_list_move (HB_LIST ** dest_pp, HB_LIST ** source_pp);
88 
89 /* jobs */
90 static void hb_add_timeval (struct timeval *tv_p, unsigned int msec);
91 static int hb_compare_timeval (struct timeval *arg1, struct timeval *arg2);
92 static const char *hb_strtime (char *s, unsigned int max, struct timeval *tv_p);
93 
94 static int hb_job_queue (HB_JOB * jobs, unsigned int job_type, HB_JOB_ARG * arg, unsigned int msec);
95 static HB_JOB_ENTRY *hb_job_dequeue (HB_JOB * jobs);
96 static void hb_job_set_expire_and_reorder (HB_JOB * jobs, unsigned int job_type, unsigned int msec);
97 static void hb_job_shutdown (HB_JOB * jobs);
98 
99 
100 /* cluster jobs */
101 static void hb_cluster_job_init (HB_JOB_ARG * arg);
102 static void hb_cluster_job_heartbeat (HB_JOB_ARG * arg);
103 static void hb_cluster_job_calc_score (HB_JOB_ARG * arg);
104 static void hb_cluster_job_failover (HB_JOB_ARG * arg);
105 static void hb_cluster_job_failback (HB_JOB_ARG * arg);
106 static void hb_cluster_job_check_ping (HB_JOB_ARG * arg);
108 static void hb_cluster_job_demote (HB_JOB_ARG * arg);
109 
110 static void hb_cluster_request_heartbeat_to_all (void);
111 static int hb_cluster_send_heartbeat_req (char *dest_host_name);
112 static int hb_cluster_send_heartbeat_resp (struct sockaddr_in *saddr, socklen_t saddr_len, char *dest_host_name);
113 static int hb_cluster_send_heartbeat_internal (struct sockaddr_in *saddr, socklen_t saddr_len, char *dest_host_name,
114  bool is_req);
115 
116 static void hb_cluster_receive_heartbeat (char *buffer, int len, struct sockaddr_in *from, socklen_t from_len);
117 static bool hb_cluster_is_isolated (void);
119 static bool hb_cluster_check_valid_ping_server (void);
120 
121 static int hb_cluster_calc_score (void);
122 
123 static int hb_set_net_header (HBP_HEADER * header, unsigned char type, bool is_req, unsigned short len,
124  unsigned int seq, char *dest_host_name);
125 static int hb_hostname_to_sin_addr (const char *host, struct in_addr *addr);
126 static int hb_hostname_n_port_to_sockaddr (const char *host, int port, struct sockaddr *saddr, socklen_t * slen);
127 
128 /* common */
129 static int hb_check_ping (const char *host);
130 
131 /* cluster jobs queue */
132 static HB_JOB_ENTRY *hb_cluster_job_dequeue (void);
133 static int hb_cluster_job_queue (unsigned int job_type, HB_JOB_ARG * arg, unsigned int msec);
134 static int hb_cluster_job_set_expire_and_reorder (unsigned int job_type, unsigned int msec);
135 static void hb_cluster_job_shutdown (void);
136 
137 /* cluster node */
138 static HB_NODE_ENTRY *hb_add_node_to_cluster (char *host_name, unsigned short priority);
139 static void hb_remove_node (HB_NODE_ENTRY * entry_p);
140 static void hb_cluster_remove_all_nodes (HB_NODE_ENTRY * first);
141 static HB_NODE_ENTRY *hb_return_node_by_name (char *name);
143 
144 static HB_UI_NODE_ENTRY *hb_return_ui_node (char *host_name, char *group_id, struct sockaddr_in saddr);
145 static HB_UI_NODE_ENTRY *hb_add_ui_node (char *host_name, char *group_id, struct sockaddr_in saddr, int state);
146 static void hb_remove_ui_node (HB_UI_NODE_ENTRY * node);
147 static void hb_cleanup_ui_nodes (HB_UI_NODE_ENTRY * first);
149 
150 static int hb_is_heartbeat_valid (char *host_name, char *group_id, struct sockaddr_in *from);
151 static const char *hb_valid_result_string (int v_result);
152 
153 static int hb_cluster_load_group_and_node_list (char *ha_node_list, char *ha_replica_list);
154 
155 /* ping host related functions */
156 static HB_PING_HOST_ENTRY *hb_add_ping_host (char *host_name);
157 static void hb_remove_ping_host (HB_PING_HOST_ENTRY * entry_p);
159 
160 /* resource jobs */
161 static void hb_resource_job_proc_start (HB_JOB_ARG * arg);
162 static void hb_resource_job_proc_dereg (HB_JOB_ARG * arg);
163 static void hb_resource_job_confirm_start (HB_JOB_ARG * arg);
164 static void hb_resource_job_confirm_dereg (HB_JOB_ARG * arg);
165 static void hb_resource_job_change_mode (HB_JOB_ARG * arg);
168 static void hb_resource_job_cleanup_all (HB_JOB_ARG * arg);
170 
173 static void hb_resource_demote_kill_server_proc (void);
174 
175 /* resource job queue */
177 static int hb_resource_job_queue (unsigned int job_type, HB_JOB_ARG * arg, unsigned int msec);
178 static int hb_resource_job_set_expire_and_reorder (unsigned int job_type, unsigned int msec);
179 
180 static void hb_resource_job_shutdown (void);
181 
182 /* resource process */
183 static HB_PROC_ENTRY *hb_alloc_new_proc (void);
184 static void hb_remove_proc (HB_PROC_ENTRY * entry_p);
185 static void hb_remove_all_procs (HB_PROC_ENTRY * first);
186 
187 static HB_PROC_ENTRY *hb_return_proc_by_args (char *args);
189 static HB_PROC_ENTRY *hb_return_proc_by_fd (int sfd);
190 static void hb_proc_make_arg (char **arg, char *args);
192 #if defined (ENABLE_UNUSED_FUNCTION)
193 static void hb_deregister_nodes (char *node_to_dereg);
194 #endif /* ENABLE_UNUSED_FUNCTION */
195 
196 /* resource process connection */
197 static int hb_resource_send_changemode (HB_PROC_ENTRY * proc);
198 static void hb_resource_send_get_eof (void);
199 static bool hb_resource_check_server_log_grow (void);
200 
/*
 * cluster/resource threads
 *
 * The entry-point signature depends on the platform's thread API. The guard
 * tests WINDOWS, the same macro this file tests before including the
 * POSIX-only headers.
 */
#if defined(WINDOWS)
static unsigned __stdcall hb_thread_cluster_worker (void *arg);
static unsigned __stdcall hb_thread_cluster_reader (void *arg);
static unsigned __stdcall hb_thread_resource_worker (void *arg);
static unsigned __stdcall hb_thread_check_disk_failure (void *arg);
#else
static void *hb_thread_cluster_worker (void *arg);
static void *hb_thread_cluster_reader (void *arg);
static void *hb_thread_resource_worker (void *arg);
static void *hb_thread_check_disk_failure (void *arg);
#endif
213 
214 
215 /* initializer */
216 static int hb_cluster_initialize (const char *nodes, const char *replicas);
217 static int hb_cluster_job_initialize (void);
218 static int hb_resource_initialize (void);
219 static int hb_resource_job_initialize (void);
220 static int hb_thread_initialize (void);
221 
222 /* terminator */
223 static void hb_resource_cleanup (void);
224 static void hb_resource_shutdown_all_ha_procs (void);
225 static void hb_cluster_cleanup (void);
226 static void hb_kill_process (pid_t * pids, int count);
227 
228 /* process command */
229 static const char *hb_node_state_string (int nstate);
230 static const char *hb_process_state_string (unsigned char ptype, int pstate);
231 static const char *hb_ping_result_string (int ping_result);
232 
233 static int hb_reload_config (void);
234 
235 static int hb_help_sprint_processes_info (char *buffer, int max_length);
236 static int hb_help_sprint_nodes_info (char *buffer, int max_length);
237 static int hb_help_sprint_jobs_info (HB_JOB * jobs, char *buffer, int max_length);
238 static int hb_help_sprint_ping_host_info (char *buffer, int max_length);
239 
245 
246 static char hb_Nolog_event_msg[LINE_MAX] = "";
247 
249 
250 static bool hb_Is_activated = true;
251 
252 /* cluster jobs */
262  NULL
263 };
264 
265 /* resource jobs */
276  NULL
277 };
278 
279 #define HA_NODE_INFO_FORMAT_STRING \
280  " HA-Node Info (current %s, state %s)\n"
281 #define HA_NODE_FORMAT_STRING \
282  " Node %s (priority %d, state %s)\n"
283 #define HA_UI_NODE_FORMAT_STRING \
284  " * Node %s (ip %s, group %s, state %s)\n"
285 #define HA_NODE_SCORE_FORMAT_STRING \
286  " - score %d\n"
287 #define HA_NODE_HEARTBEAT_GAP_FORMAT_STRING \
288  " - missed heartbeat %d\n"
289 
290 #define HA_PROCESS_INFO_FORMAT_STRING \
291  " HA-Process Info (master %d, state %s)\n"
292 #define HA_SERVER_PROCESS_FORMAT_STRING \
293  " Server %s (pid %d, state %s)\n"
294 #define HA_COPYLOG_PROCESS_FORMAT_STRING \
295  " Copylogdb %s (pid %d, state %s)\n"
296 #define HA_APPLYLOG_PROCESS_FORMAT_STRING \
297  " Applylogdb %s (pid %d, state %s)\n"
298 #define HA_PROCESS_EXEC_PATH_FORMAT_STRING \
299  " - exec-path [%s] \n"
300 #define HA_PROCESS_ARGV_FORMAT_STRING \
301  " - argv [%s] \n"
302 #define HA_PROCESS_REGISTER_TIME_FORMAT_STRING \
303  " - registered-time %s\n"
304 #define HA_PROCESS_DEREGISTER_TIME_FORMAT_STRING \
305  " - deregistered-time %s\n"
306 #define HA_PROCESS_SHUTDOWN_TIME_FORMAT_STRING \
307  " - shutdown-time %s\n"
308 #define HA_PROCESS_START_TIME_FORMAT_STRING \
309  " - start-time %s\n"
310 
311 #define HA_PING_HOSTS_INFO_FORMAT_STRING \
312  " HA-Ping Host Info (PING check %s)\n"
313 #define HA_PING_HOSTS_FORMAT_STRING \
314  " %-20s %s\n"
315 
316 #define HA_ADMIN_INFO_FORMAT_STRING \
317  " HA-Admin Info\n"
318 #define HA_ADMIN_INFO_NOLOG_FORMAT_STRING \
319  " Error Logging: disabled\n"
320 #define HA_ADMIN_INFO_NOLOG_EVENT_FORMAT_STRING \
321  " %s\n"
322 /*
323  * linked list
324  */
325 /*
326  * hb_list_add() -
327  * return: none
328  *
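329  * p(in/out): address of the link that will point to the new entry
330  * n(in/out): entry to insert in front of the entry currently at *p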
331  */
332 static void
334 {
335  n->next = *(p);
336  if (n->next)
337  {
338  n->next->prev = &(n->next);
339  }
340  n->prev = p;
341  *(p) = n;
342 }
343 
344 /*
345  * hb_list_remove() -
346  * return: none
347  * n(in/out): entry to unlink from its list
348  */
349 static void
351 {
352  if (n->prev)
353  {
354  *(n->prev) = n->next;
355  if (*(n->prev))
356  {
357  n->next->prev = n->prev;
358  }
359  }
360  n->next = NULL;
361  n->prev = NULL;
362 }
363 
364 /*
365  * hb_list_move() -
366  * return: none
367  * dest_pp(in):
368  * source_pp(in):
369  */
370 static void
371 hb_list_move (HB_LIST ** dest_pp, HB_LIST ** source_pp)
372 {
373  *dest_pp = *source_pp;
374  if (*dest_pp)
375  {
376  (*dest_pp)->prev = dest_pp;
377  }
378 
379  *source_pp = NULL;
380 }
381 
382 /*
383  * job common
384  */
385 
/*
 * hb_add_timeval() - advance a timestamp by a number of milliseconds
 *
 *   return: none
 *   tv_p(in/out): timestamp to advance; ignored when NULL
 *   msec(in): amount to add, in milliseconds
 *
 * NOTE: the result is normalised so that tv_usec stays below one second.
 *       hb_compare_timeval() compares tv_sec first and tv_usec second, which
 *       is only a correct ordering when tv_usec has been carried into tv_sec.
 */
static void
hb_add_timeval (struct timeval *tv_p, unsigned int msec)
{
  if (tv_p == NULL)
    {
      return;
    }

  tv_p->tv_sec += (msec / 1000);
  tv_p->tv_usec += ((msec % 1000) * 1000);

  /* carry whole seconds accumulated in the microsecond field */
  if (tv_p->tv_usec >= 1000000)
    {
      tv_p->tv_sec += (tv_p->tv_usec / 1000000);
      tv_p->tv_usec %= 1000000;
    }
}
404 
/*
 * hb_compare_timeval() - three-way comparison of two timestamps
 *   return: (1)  if arg1 is later than arg2
 *           (0)  if both are equal (or both are NULL)
 *           (-1) if arg1 is earlier than arg2
 *
 *   arg1(in): left operand; a NULL pointer sorts before any timestamp
 *   arg2(in): right operand; a NULL pointer sorts before any timestamp
 */
static int
hb_compare_timeval (struct timeval *arg1, struct timeval *arg2)
{
  /* NULL handling: NULL == NULL, and NULL is smaller than anything else */
  if (arg1 == NULL || arg2 == NULL)
    {
      if (arg1 == arg2)
	{
	  return 0;
	}
      return (arg1 == NULL) ? -1 : 1;
    }

  /* seconds decide first ... */
  if (arg1->tv_sec != arg2->tv_sec)
    {
      return (arg1->tv_sec > arg2->tv_sec) ? 1 : -1;
    }

  /* ... microseconds break the tie */
  if (arg1->tv_usec != arg2->tv_usec)
    {
      return (arg1->tv_usec > arg2->tv_usec) ? 1 : -1;
    }

  return 0;
}
454 
/*
 * hb_strtime() - format a timestamp as "mm/dd/yy HH:MM:SS.mmm" (local time)
 *
 *   return: s on success; the constant string "00/00/00 00:00:00.000" when
 *           an argument is unusable or the time cannot be converted
 *   s(out): destination buffer
 *   max(in): size of s in bytes; must be at least 24
 *   tv_p(in): timestamp to format; a tv_sec of 0 is treated as "not set"
 *
 * NOTE: the millisecond suffix is written with the space that actually
 *       remains after the date/time part, so an out-of-range tv_usec is
 *       truncated instead of being written past the end of s.
 */
static const char *
hb_strtime (char *s, unsigned int max, struct timeval *tv_p)
{
  struct tm hb_tm;
  size_t len;

  if (s == NULL || max < 24 || tv_p == NULL || tv_p->tv_sec == 0)
    {
      goto error_return;
    }
  *s = '\0';

  if (localtime_r (&tv_p->tv_sec, &hb_tm) == NULL)
    {
      goto error_return;
    }

  /* keep room for the ".mmm" suffix and the terminator */
  len = strftime (s, (max - 5), "%m/%d/%y %H:%M:%S", &hb_tm);

  /* bound the write by what is left in s, not by the full buffer size */
  snprintf (s + len, max - len, ".%03ld", (long) (tv_p->tv_usec / 1000));
  s[max - 1] = '\0';
  return (const char *) s;

error_return:
  return (const char *) "00/00/00 00:00:00.000";
}
488 
489 /*
490  * hb_job_queue() - enqueue a job to the queue sorted by expire time
491  * return: NO_ERROR or ER_FAILED
492  *
493  * jobs(in):
494  * job_type(in):
495  * arg(in):
496  * msec(in):
497  */
498 static int
499 hb_job_queue (HB_JOB * jobs, unsigned int job_type, HB_JOB_ARG * arg, unsigned int msec)
500 {
501  HB_JOB_ENTRY **job;
502  HB_JOB_ENTRY *new_job;
503  struct timeval now;
504  int rv;
505 
506  new_job = (HB_JOB_ENTRY *) malloc (sizeof (HB_JOB_ENTRY));
507  if (new_job == NULL)
508  {
511  }
512 
513  gettimeofday (&now, NULL);
514  hb_add_timeval (&now, msec);
515 
516  new_job->prev = NULL;
517  new_job->next = NULL;
518  new_job->type = job_type;
519  new_job->func = jobs->job_funcs[job_type];
520  new_job->arg = arg;
521  memcpy ((void *) &(new_job->expire), (void *) &now, sizeof (struct timeval));
522 
523  rv = pthread_mutex_lock (&jobs->lock);
524  for (job = &(jobs->jobs); *job; job = &((*job)->next))
525  {
526  /*
527  * compare expire time of new job and current job
528  * until new job's expire is larger than current's
529  */
530  if (hb_compare_timeval (&((*job)->expire), &now) <= 0)
531  {
532  continue;
533  }
534  break;
535  }
536  hb_list_add ((HB_LIST **) job, (HB_LIST *) new_job);
537 
538  pthread_mutex_unlock (&jobs->lock);
539  return NO_ERROR;
540 }
541 
542 /*
543  * hb_job_dequeue() - dequeue the first job in the queue if its expiration time
544  * is not later than the current time
545  * return: pointer to heartbeat job entry
546  *
547  * jobs(in):
548  */
549 static HB_JOB_ENTRY *
551 {
552  struct timeval now;
553  HB_JOB_ENTRY *job;
554  int rv;
555 
556  gettimeofday (&now, NULL);
557 
558  rv = pthread_mutex_lock (&jobs->lock);
559  if (jobs->shutdown == true)
560  {
561  pthread_mutex_unlock (&jobs->lock);
562  return NULL;
563  }
564 
565  job = jobs->jobs;
566  if (job == NULL)
567  {
568  pthread_mutex_unlock (&jobs->lock);
569  return NULL;
570  }
571 
572  if (hb_compare_timeval (&now, &job->expire) >= 0)
573  {
574  hb_list_remove ((HB_LIST *) job);
575  }
576  else
577  {
578  pthread_mutex_unlock (&jobs->lock);
579  return NULL;
580  }
581  pthread_mutex_unlock (&jobs->lock);
582 
583  return job;
584 }
585 
586 /*
587  * hb_job_set_expire_and_reorder - set expiration time of the first job which match job_type
588  * reorder job with expiration time changed
589  * return: none
590  *
591  * jobs(in):
592  * job_type(in):
593  * msec(in):
594  */
595 static void
596 hb_job_set_expire_and_reorder (HB_JOB * jobs, unsigned int job_type, unsigned int msec)
597 {
598  HB_JOB_ENTRY **job = NULL;
599  HB_JOB_ENTRY *target_job = NULL;
600  struct timeval now;
601 
602  gettimeofday (&now, NULL);
603  hb_add_timeval (&now, msec);
604 
605  pthread_mutex_lock (&jobs->lock);
606 
607  if (jobs->shutdown == true)
608  {
609  pthread_mutex_unlock (&jobs->lock);
610  return;
611  }
612 
613  for (job = &(jobs->jobs); *job; job = &((*job)->next))
614  {
615  if ((*job)->type == job_type)
616  {
617  target_job = *job;
618  break;
619  }
620  }
621 
622  if (target_job == NULL)
623  {
624  pthread_mutex_unlock (&jobs->lock);
625  return;
626  }
627 
628  memcpy ((void *) &(target_job->expire), (void *) &now, sizeof (struct timeval));
629 
630  /*
631  * so now we change target job's turn to adjust sorted queue
632  */
633  hb_list_remove ((HB_LIST *) target_job);
634 
635  for (job = &(jobs->jobs); *job; job = &((*job)->next))
636  {
637  /*
638  * compare expiration time of target job and current job
639  * until target job's expire is larger than current's
640  */
641  if (hb_compare_timeval (&((*job)->expire), &(target_job->expire)) > 0)
642  {
643  break;
644  }
645  }
646 
647  hb_list_add ((HB_LIST **) job, (HB_LIST *) target_job);
648 
649  pthread_mutex_unlock (&jobs->lock);
650 
651  return;
652 }
653 
654 /*
655  * hb_job_shutdown() - clear job queue and stop job worker thread
656  * return: none
657  *
658  * jobs(in):
659  */
660 static void
662 {
663  int rv;
664  HB_JOB_ENTRY *job, *job_next;
665 
666  rv = pthread_mutex_lock (&jobs->lock);
667  for (job = jobs->jobs; job; job = job_next)
668  {
669  job_next = job->next;
670 
671  hb_list_remove ((HB_LIST *) job);
672  free_and_init (job);
673  }
674  jobs->shutdown = true;
675  pthread_mutex_unlock (&jobs->lock);
676 }
677 
678 
679 /*
680  * cluster node job actions
681  */
682 
683 /*
684  * hb_cluster_job_init() -
685  * return: none
686  *
687  * arg(in):
688  */
689 static void
691 {
692  int error;
693 
695  assert (error == NO_ERROR);
696 
698  assert (error == NO_ERROR);
699 
701  assert (error == NO_ERROR);
702 
703  if (arg)
704  {
705  free_and_init (arg);
706  }
707 }
708 
709 /*
710  * hb_cluster_job_heartbeat() - send heartbeat request to other nodes
711  * return: none
712  *
713  * jobs(in):
714  */
715 static void
717 {
718  int error, rv;
719 
720  rv = pthread_mutex_lock (&hb_Cluster->lock);
721 
722  if (hb_Cluster->hide_to_demote == false)
723  {
725  }
726 
727  pthread_mutex_unlock (&hb_Cluster->lock);
729  assert (error == NO_ERROR);
730 
731  if (arg)
732  {
733  free_and_init (arg);
734  }
735  return;
736 }
737 
738 /*
739  * hb_cluster_is_isolated() -
740  * return: whether current node is isolated or not
741  *
742  */
743 static bool
745 {
746  HB_NODE_ENTRY *node;
747  for (node = hb_Cluster->nodes; node; node = node->next)
748  {
749  if (node->state == HB_NSTATE_REPLICA)
750  {
751  continue;
752  }
753 
754  if (hb_Cluster->myself != node && node->state != HB_NSTATE_UNKNOWN)
755  {
756  return false;
757  }
758  }
759  return true;
760 }
761 
762 /*
763  * hb_cluster_is_received_heartbeat_from_all() -
764  * return: whether current node received heartbeat from all node
765  */
766 static bool
768 {
769  HB_NODE_ENTRY *node;
770  struct timeval now;
771  unsigned int heartbeat_confirm_time;
772 
774 
775  gettimeofday (&now, NULL);
776 
777  for (node = hb_Cluster->nodes; node; node = node->next)
778  {
779  if (hb_Cluster->myself != node && HB_GET_ELAPSED_TIME (now, node->last_recv_hbtime) > heartbeat_confirm_time)
780  {
781  return false;
782  }
783  }
784  return true;
785 }
786 
787 /*
788  * hb_cluster_job_calc_score() -
789  * return: none
790  *
791  * jobs(in):
792  */
793 static void
795 {
796  int error, rv;
797  int num_master;
798  unsigned int failover_wait_time;
799  HB_JOB_ARG *job_arg;
800  HB_CLUSTER_JOB_ARG *clst_arg;
801  char hb_info_str[HB_INFO_STR_MAX];
802 
803  ENTER_FUNC ();
804 
805  rv = pthread_mutex_lock (&hb_Cluster->lock);
806 
807  num_master = hb_cluster_calc_score ();
808  hb_Cluster->is_isolated = hb_cluster_is_isolated ();
809 
810  if (hb_Cluster->state == HB_NSTATE_REPLICA || hb_Cluster->hide_to_demote == true)
811  {
812  goto calc_end;
813  }
814 
815  /* case : check whether master has been isolated */
816  if (hb_Cluster->state == HB_NSTATE_MASTER)
817  {
818  if (hb_Cluster->is_isolated == true)
819  {
820  /* check ping if Ping host exist */
821  pthread_mutex_unlock (&hb_Cluster->lock);
822 
823  job_arg = (HB_JOB_ARG *) malloc (sizeof (HB_JOB_ARG));
824  if (job_arg)
825  {
826  clst_arg = &(job_arg->cluster_job_arg);
827  clst_arg->ping_check_count = 0;
828  clst_arg->retries = 0;
829 
831  assert (error == NO_ERROR);
832  }
833 
834  if (arg)
835  {
836  free_and_init (arg);
837  }
838 
839  return;
840  }
841  }
842 
843  /* case : split-brain */
844  if ((num_master > 1)
845  && (hb_Cluster->master && hb_Cluster->myself && hb_Cluster->myself->state == HB_NSTATE_MASTER
846  && hb_Cluster->master->priority != hb_Cluster->myself->priority))
847  {
849  "More than one master detected and failback will be initiated");
850 
853 
854  if (hb_Cluster->num_ping_hosts > 0)
855  {
858  }
859 
860  pthread_mutex_unlock (&hb_Cluster->lock);
861 
863  assert (error == NO_ERROR);
864 
865  if (arg)
866  {
867  free_and_init (arg);
868  }
869 
870  return;
871  }
872 
873  /* case : failover */
874  if ((hb_Cluster->state == HB_NSTATE_SLAVE)
875  && (hb_Cluster->master && hb_Cluster->myself && hb_Cluster->master->priority == hb_Cluster->myself->priority))
876  {
877  hb_Cluster->state = HB_NSTATE_TO_BE_MASTER;
879 
880  pthread_mutex_unlock (&hb_Cluster->lock);
881 
882  job_arg = (HB_JOB_ARG *) malloc (sizeof (HB_JOB_ARG));
883  if (job_arg)
884  {
885  clst_arg = &(job_arg->cluster_job_arg);
886  clst_arg->ping_check_count = 0;
887 
889  assert (error == NO_ERROR);
890  }
891  else
892  {
894 
896  {
897  failover_wait_time = HB_JOB_TIMER_WAIT_500_MILLISECOND;
898  }
899  else
900  {
901  /* If current node didn't receive heartbeat from some node, wait for some time */
903  }
904 
905  error = hb_cluster_job_queue (HB_CJOB_FAILOVER, NULL, failover_wait_time);
906  assert (error == NO_ERROR);
907 
909  "A failover attempted to make the current node a master");
910  }
911 
912  if (arg)
913  {
914  free_and_init (arg);
915  }
916 
917  return;
918  }
919 
921  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "%s", hb_info_str);
922 
923 calc_end:
924  pthread_mutex_unlock (&hb_Cluster->lock);
925 
926  error =
928  assert (error == NO_ERROR);
929 
930  if (arg)
931  {
932  free_and_init (arg);
933  }
934 
935  return;
936 }
937 
938 /*
939  * hb_cluster_job_check_ping() -
940  * return: none
941  *
942  * jobs(in):
943  */
944 static void
946 {
947  int error, rv;
948  int ping_try_count = 0;
949  bool ping_success = false;
950  int ping_result;
951  unsigned int failover_wait_time;
952  HB_CLUSTER_JOB_ARG *clst_arg = (arg) ? &(arg->cluster_job_arg) : NULL;
953  HB_PING_HOST_ENTRY *ping_host;
954 
955  ENTER_FUNC ();
956 
957  rv = pthread_mutex_lock (&hb_Cluster->lock);
958 
959  if (clst_arg == NULL || hb_Cluster->num_ping_hosts == 0 || hb_Cluster->is_ping_check_enabled == false)
960  {
961  /* If Ping Host is either empty or marked invalid, MASTER->MASTER, SLAVE->MASTER. It may cause split-brain
962  * problem. */
963  if (hb_Cluster->state == HB_NSTATE_MASTER)
964  {
965  goto ping_check_cancel;
966  }
967  }
968  else
969  {
970  for (ping_host = hb_Cluster->ping_hosts; ping_host; ping_host = ping_host->next)
971  {
972  ping_result = hb_check_ping (ping_host->host_name);
973 
974  ping_host->ping_result = ping_result;
975  if (ping_result == HB_PING_SUCCESS)
976  {
977  ping_try_count++;
978  ping_success = true;
979  break;
980  }
981  else if (ping_result == HB_PING_FAILURE)
982  {
983  ping_try_count++;
984  }
985  }
986 
987  if (hb_Cluster->state == HB_NSTATE_MASTER)
988  {
989  if (ping_try_count == 0 || ping_success == true)
990  {
991  goto ping_check_cancel;
992  }
993  }
994  else
995  {
996  if (ping_try_count > 0 && ping_success == false)
997  {
998  goto ping_check_cancel;
999  }
1000  }
1001 
1002  if ((++clst_arg->ping_check_count) < HB_MAX_PING_CHECK)
1003  {
1004  /* Try ping test again */
1005  pthread_mutex_unlock (&hb_Cluster->lock);
1006 
1008  assert (error == NO_ERROR);
1009 
1010  return;
1011  }
1012  }
1013 
1014  /* Now, we have tried ping test over HB_MAX_PING_CHECK times. (or Slave's ping host is either empty or invalid.) So,
1015  * we can determine this node's next job (failover or failback). */
1016 
1018 
1019  pthread_mutex_unlock (&hb_Cluster->lock);
1020 
1021  if (hb_Cluster->state == HB_NSTATE_MASTER)
1022  {
1023  /* If this node is Master, do failback */
1025  assert (error == NO_ERROR);
1026  }
1027  else
1028  {
1029  /* If this node is Slave, do failover */
1031  {
1032  failover_wait_time = HB_JOB_TIMER_WAIT_500_MILLISECOND;
1033  }
1034  else
1035  {
1036  /* If current node didn't receive heartbeat from some node, wait for some time */
1038  }
1039  error = hb_cluster_job_queue (HB_CJOB_FAILOVER, NULL, failover_wait_time);
1040  assert (error == NO_ERROR);
1041  }
1042 
1043  if (arg)
1044  {
1045  free_and_init (arg);
1046  }
1047 
1048  EXIT_FUNC ();
1049 
1050  return;
1051 
1052 ping_check_cancel:
1053 /* if this node is a master, then failback is cancelled */
1054 
1055  if (hb_Cluster->state != HB_NSTATE_MASTER)
1056  {
1057  MASTER_ER_SET (ER_NOTIFICATION_SEVERITY, ARG_FILE_LINE, ER_HB_NODE_EVENT, 1, "Failover cancelled by ping check");
1058  hb_Cluster->state = HB_NSTATE_SLAVE;
1059  }
1061 
1062  pthread_mutex_unlock (&hb_Cluster->lock);
1063 
1064  /* do calc_score job again */
1065  error =
1067  assert (error == NO_ERROR);
1068 
1069  if (arg)
1070  {
1071  free_and_init (arg);
1072  }
1073 
1074  EXIT_FUNC ();
1075 
1076  return;
1077 }
1078 
1079 
1080 /*
1081  * hb_cluster_job_failover() -
1082  * return: none
1083  *
1084  * jobs(in):
1085  */
1086 static void
1088 {
1089  int error, rv;
1090  int num_master;
1091  char hb_info_str[HB_INFO_STR_MAX];
1092 
1093  ENTER_FUNC ();
1094 
1095  rv = pthread_mutex_lock (&hb_Cluster->lock);
1096 
1097  num_master = hb_cluster_calc_score ();
1098 
1099  if (hb_Cluster->master && hb_Cluster->myself && hb_Cluster->master->priority == hb_Cluster->myself->priority)
1100  {
1101  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_HB_NODE_EVENT, 1, "Failover completed");
1102  hb_Cluster->state = HB_NSTATE_MASTER;
1103  hb_Resource->state = HB_NSTATE_MASTER;
1104 
1106  assert (error == NO_ERROR);
1107  }
1108  else
1109  {
1110  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_HB_NODE_EVENT, 1, "Failover cancelled");
1111  hb_Cluster->state = HB_NSTATE_SLAVE;
1112  }
1113 
1116 
1117  if (hb_Cluster->num_ping_hosts > 0)
1118  {
1121  }
1122 
1124  pthread_mutex_unlock (&hb_Cluster->lock);
1125 
1126  error =
1128  assert (error == NO_ERROR);
1129 
1130  if (arg)
1131  {
1132  free_and_init (arg);
1133  }
1134  return;
1135 }
1136 
1137 /*
1138  * hb_cluster_job_demote() -
1139  * waits for a new master to be elected.
1140  * hb_resource_job_demote_start_shutdown must precede
1141  * this job.
1142  * return: none
1143  *
1144  * arg(in):
1145  */
1146 static void
1148 {
1149  int rv, error;
1150  HB_NODE_ENTRY *node;
1151  HB_CLUSTER_JOB_ARG *clst_arg = (arg) ? &(arg->cluster_job_arg) : NULL;
1152  char hb_info_str[HB_INFO_STR_MAX];
1153 
1154  ENTER_FUNC ();
1155 
1156  if (arg == NULL || clst_arg == NULL)
1157  {
1158  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "invalid arg or proc_arg. " "(arg:%p, proc_arg:%p). \n", arg, clst_arg);
1159  return;
1160  }
1161 
1162  rv = pthread_mutex_lock (&hb_Cluster->lock);
1163 
1164  if (clst_arg->retries == 0)
1165  {
1166  assert (hb_Cluster->state == HB_NSTATE_MASTER);
1167  assert (hb_Resource->state == HB_NSTATE_SLAVE);
1168 
1169  /* send state (HB_NSTATE_UNKNOWN) to other nodes for making other node be master */
1170  hb_Cluster->state = HB_NSTATE_UNKNOWN;
1172 
1174  "Waiting for a new node to be elected as master");
1175  }
1176 
1177  hb_Cluster->hide_to_demote = true;
1178  hb_Cluster->state = HB_NSTATE_SLAVE;
1179  hb_Cluster->myself->state = hb_Cluster->state;
1180 
1181  if (hb_Cluster->is_isolated == true || ++(clst_arg->retries) > HB_MAX_WAIT_FOR_NEW_MASTER)
1182  {
1184  "Failed to find a new master node and it changes " "its role back to master again");
1185  hb_Cluster->hide_to_demote = false;
1186 
1187  pthread_mutex_unlock (&hb_Cluster->lock);
1188 
1189  if (arg)
1190  {
1191  free_and_init (arg);
1192  }
1193  return;
1194  }
1195 
1196  for (node = hb_Cluster->nodes; node; node = node->next)
1197  {
1198  if (node->state == HB_NSTATE_MASTER)
1199  {
1200  assert (node != hb_Cluster->myself);
1201 
1202  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_HB_NODE_EVENT, 1, "Found a new master node");
1203 
1206 
1207  if (hb_Cluster->num_ping_hosts > 0)
1208  {
1211  }
1212 
1213  hb_Cluster->hide_to_demote = false;
1214 
1215  pthread_mutex_unlock (&hb_Cluster->lock);
1216 
1217  if (arg)
1218  {
1219  free_and_init (arg);
1220  }
1221  return;
1222  }
1223  }
1224 
1225  pthread_mutex_unlock (&hb_Cluster->lock);
1226 
1228 
1229  if (error != NO_ERROR)
1230  {
1231  assert (false);
1232  free_and_init (arg);
1233  }
1234  return;
1235 }
1236 
1237 /*
1238  * hb_cluster_job_failback () -
1239  * return: none
1240  *
1241  * jobs(in):
1242  *
1243  * NOTE: this job waits for servers to be killed.
1244  * Therefore, be aware that adding this job to queue might
1245  * temporarily prevent cluster_job_calc or any other cluster
1246  * jobs following this one from executing at regular intervals
1247  * as intended.
1248  */
1249 static void
1251 {
1252  int error, count = 0;
1253  char hb_info_str[HB_INFO_STR_MAX];
1254  HB_PROC_ENTRY *proc;
1255  pid_t *pids = NULL;
1256  size_t size;
1257  bool emergency_kill_enabled = false;
1258 
1259  ENTER_FUNC ();
1260 
1261  pthread_mutex_lock (&hb_Cluster->lock);
1262 
1263  hb_Cluster->state = HB_NSTATE_SLAVE;
1264  hb_Cluster->myself->state = hb_Cluster->state;
1265 
1267 
1269  "This master will become a slave and cub_server will be restarted");
1270 
1273 
1274  if (hb_Cluster->num_ping_hosts > 0)
1275  {
1278  }
1279 
1280  pthread_mutex_unlock (&hb_Cluster->lock);
1281 
1282  pthread_mutex_lock (&hb_Resource->lock);
1283  hb_Resource->state = HB_NSTATE_SLAVE;
1284 
1285  proc = hb_Resource->procs;
1286  while (proc)
1287  {
1288  if (proc->type != HB_PTYPE_SERVER)
1289  {
1290  proc = proc->next;
1291  continue;
1292  }
1293 
1294  if (emergency_kill_enabled == false)
1295  {
1296  size = sizeof (pid_t) * (count + 1);
1297  pids = (pid_t *) realloc (pids, size);
1298  if (pids == NULL)
1299  {
1301 
1302  /*
1303  * in case that memory allocation fails,
1304  * kill all cub_server processes with SIGKILL
1305  */
1306  emergency_kill_enabled = true;
1307  proc = hb_Resource->procs;
1308  continue;
1309  }
1310  pids[count++] = proc->pid;
1311  }
1312  else
1313  {
1314  assert (proc->pid > 0);
1315  if (proc->pid > 0)
1316  {
1317  kill (proc->pid, SIGKILL);
1318  }
1319  }
1320  proc = proc->next;
1321  }
1322 
1323  pthread_mutex_unlock (&hb_Resource->lock);
1324 
1325  if (emergency_kill_enabled == false)
1326  {
1327  hb_kill_process (pids, count);
1328  }
1329 
1330  if (pids)
1331  {
1332  free_and_init (pids);
1333  }
1334 
1335  error =
1337  assert (error == NO_ERROR);
1338 
1339  if (arg)
1340  {
1341  free_and_init (arg);
1342  }
1343  return;
1344 }
1345 
1346 /*
1347  * hb_cluster_check_valid_ping_server() -
1348  * return: whether a valid ping host exists or not
1349  *
1350  * NOTE: it returns true when no ping host is specified.
1351  */
1352 static bool
1354 {
1355  HB_PING_HOST_ENTRY *ping_host;
1356  bool valid_ping_host_exists = false;
1357 
1358  if (hb_Cluster->num_ping_hosts == 0)
1359  {
1360  return true;
1361  }
1362 
1363  for (ping_host = hb_Cluster->ping_hosts; ping_host; ping_host = ping_host->next)
1364  {
1365  ping_host->ping_result = hb_check_ping (ping_host->host_name);
1366 
1367  if (ping_host->ping_result == HB_PING_SUCCESS)
1368  {
1369  valid_ping_host_exists = true;
1370  }
1371  }
1372 
1373  return valid_ping_host_exists;
1374 }
1375 
1376 /*
1377  * hb_cluster_job_check_valid_ping_server() -
1378  * return: none
1379  *
1380  * jobs(in):
1381  */
1382 static void
1384 {
1385  int error, rv;
1386  bool valid_ping_host_exists;
1387  char buf[LINE_MAX];
1389 
1390  rv = pthread_mutex_lock (&hb_Cluster->lock);
1391 
1392  if (hb_Cluster->num_ping_hosts == 0)
1393  {
1394  goto check_end;
1395  }
1396 
1397  valid_ping_host_exists = hb_cluster_check_valid_ping_server ();
1398  if (valid_ping_host_exists == false && hb_cluster_is_isolated () == false)
1399  {
1401 
1402  if (hb_Cluster->is_ping_check_enabled == true)
1403  {
1404  hb_Cluster->is_ping_check_enabled = false;
1405  snprintf (buf, LINE_MAX,
1406  "Validity check for PING failed on all hosts " "and PING check is now temporarily disabled.");
1408  }
1409  }
1410  else if (valid_ping_host_exists == true)
1411  {
1412  if (hb_Cluster->is_ping_check_enabled == false)
1413  {
1414  hb_Cluster->is_ping_check_enabled = true;
1415  snprintf (buf, LINE_MAX, "Validity check for PING succeeded " "and PING check is now enabled.");
1417  }
1418  }
1419 
1420 check_end:
1421  pthread_mutex_unlock (&hb_Cluster->lock);
1422 
1423  error = hb_cluster_job_queue (HB_CJOB_CHECK_VALID_PING_SERVER, NULL, check_interval);
1424 
1425  assert (error == NO_ERROR);
1426 
1427  return;
1428 }
1429 
1430 /*
1431  * cluster common
1432  */
1433 
1434 /*
1435  * hb_cluster_calc_score() -
1436  * return: number of master nodes in heartbeat cluster
1437  */
1438 static int
1440 {
1441  int num_master = 0;
1442  short min_score = HB_NODE_SCORE_UNKNOWN;
1443  HB_NODE_ENTRY *node;
1444  struct timeval now;
1445 
1446  if (hb_Cluster == NULL)
1447  {
1448  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "hb_Cluster is null. \n");
1449  return ER_FAILED;
1450  }
1451 
1452  hb_Cluster->myself->state = hb_Cluster->state;
1453  gettimeofday (&now, NULL);
1454 
1455  for (node = hb_Cluster->nodes; node; node = node->next)
1456  {
1457  /* If no heartbeat message has been received from a node for more than prm_get_integer_value (PRM_ID_HA_MAX_HEARTBEAT_GAP)
1458  * times (or enough time has elapsed since the last heartbeat message was received), this node can no longer
1459  * know what state that node is in. */
1462  && HB_GET_ELAPSED_TIME (now,
1463  node->last_recv_hbtime) >
1465  {
1466  node->heartbeat_gap = 0;
1467  node->last_recv_hbtime.tv_sec = 0;
1468  node->last_recv_hbtime.tv_usec = 0;
1469  node->state = HB_NSTATE_UNKNOWN;
1470  }
1471 
1472  switch (node->state)
1473  {
1474  case HB_NSTATE_MASTER:
1475  case HB_NSTATE_TO_BE_SLAVE:
1476  {
1477  node->score = node->priority | HB_NODE_SCORE_MASTER;
1478  }
1479  break;
1481  {
1482  node->score = node->priority | HB_NODE_SCORE_TO_BE_MASTER;
1483  }
1484  break;
1485  case HB_NSTATE_SLAVE:
1486  {
1487  node->score = node->priority | HB_NODE_SCORE_SLAVE;
1488  }
1489  break;
1490  case HB_NSTATE_REPLICA:
1491  case HB_NSTATE_UNKNOWN:
1492  default:
1493  {
1494  node->score = node->priority | HB_NODE_SCORE_UNKNOWN;
1495  }
1496  break;
1497  }
1498 
1499  if (node->score < min_score)
1500  {
1501  hb_Cluster->master = node;
1502  min_score = node->score;
1503  }
1504 
1505  if (node->score < (short) HB_NODE_SCORE_TO_BE_MASTER)
1506  {
1507  num_master++;
1508  }
1509  }
1510 
1511  return num_master;
1512 }
1513 
1514 /*
1515  * hb_cluster_request_heartbeat_to_all() -
1516  * return: none
1517  *
1518  */
1519 static void
1521 {
1522  HB_NODE_ENTRY *node;
1523 
1524  if (hb_Cluster == NULL)
1525  {
1526  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "hb_Cluster is null. \n");
1527  return;
1528  }
1529 
1530  for (node = hb_Cluster->nodes; node; node = node->next)
1531  {
1532  if (are_hostnames_equal (hb_Cluster->host_name, node->host_name))
1533  {
1534  continue;
1535  }
1536 
1538  node->heartbeat_gap++;
1539  }
1540 
1541  return;
1542 }
1543 
1544 /*
1545  * hb_cluster_send_heartbeat_req() -
1546  * return: none
1547  *
1548  * host_name(in):
1549  */
1550 static int
1551 hb_cluster_send_heartbeat_req (char *dest_host_name)
1552 {
1553  struct sockaddr_in saddr;
1554  socklen_t saddr_len;
1555 
1556  /* construct destination address */
1557  memset ((void *) &saddr, 0, sizeof (saddr));
1558  int error_code = hb_hostname_n_port_to_sockaddr (dest_host_name, prm_get_integer_value (PRM_ID_HA_PORT_ID),
1559  (struct sockaddr *) &saddr, &saddr_len);
1560  if (error_code != NO_ERROR)
1561  {
1562  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "hb_hostname_n_port_to_sockaddr failed. \n");
1563  return error_code;
1564  }
1565 
1566  return hb_cluster_send_heartbeat_internal (&saddr, saddr_len, dest_host_name, true);
1567 }
1568 
/*
 * hb_cluster_send_heartbeat_resp() - send a heartbeat message that is not
 *                                    flagged as a request
 *   return: whatever hb_cluster_send_heartbeat_internal() returns
 *
 *   saddr(in): destination socket address
 *   saddr_len(in): length of saddr
 *   dest_host_name(in): name of the destination host
 */
static int
hb_cluster_send_heartbeat_resp (struct sockaddr_in *saddr, socklen_t saddr_len, char *dest_host_name)
{
  const bool is_request = false;

  return hb_cluster_send_heartbeat_internal (saddr, saddr_len, dest_host_name, is_request);
}
1574 
1575 static int
1576 hb_cluster_send_heartbeat_internal (struct sockaddr_in *saddr, socklen_t saddr_len, char *dest_host_name, bool is_req)
1577 {
1579  char buffer[HB_BUFFER_SZ], *p;
1580  size_t hb_len;
1581  int send_len;
1582 
1583  memset ((void *) buffer, 0, sizeof (buffer));
1584  hbp_header = (HBP_HEADER *) (&buffer[0]);
1585 
1586  int error_code = hb_set_net_header (hbp_header, HBP_CLUSTER_HEARTBEAT, is_req, OR_INT_SIZE, 0, dest_host_name);
1587  if (error_code != NO_ERROR)
1588  {
1589  return error_code;
1590  }
1591 
1592  p = (char *) (hbp_header + 1);
1593  p = or_pack_int (p, hb_Cluster->state);
1594 
1595  hb_len = sizeof (HBP_HEADER) + OR_INT_SIZE;
1596 
1597  if (hb_Cluster->sfd == INVALID_SOCKET)
1598  {
1601  }
1602 
1603  send_len = sendto (hb_Cluster->sfd, (void *) &buffer[0], hb_len, 0, (struct sockaddr *) saddr, saddr_len);
1604  if (send_len <= 0)
1605  {
1606  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "sendto failed. \n");
1607  return ER_FAILED;
1608  }
1609 
1610  return NO_ERROR;
1611 }
1612 
1613 
1614 /*
1615  * hb_cluster_receive_heartbeat() -
1616  * return: none
1617  *
1618  * buffer(in):
1619  * len(in):
1620  * from(in):
1621  * from_len(in):
1622  */
1623 static void
1624 hb_cluster_receive_heartbeat (char *buffer, int len, struct sockaddr_in *from, socklen_t from_len)
1625 {
1626  int rv;
1628  HB_NODE_ENTRY *node;
1629  HB_UI_NODE_ENTRY *ui_node;
1630  char error_string[LINE_MAX] = "";
1631  char *p;
1632 
1633  int state = 0; /* HB_NODE_STATE_TYPE */
1634  bool is_state_changed = false;
1635 
1636  hbp_header = (HBP_HEADER *) (buffer);
1637 
1638  rv = pthread_mutex_lock (&hb_Cluster->lock);
1639  if (hb_Cluster->shutdown)
1640  {
1641  pthread_mutex_unlock (&hb_Cluster->lock);
1642  return;
1643  }
1644 
1645  /* validate receive message */
1646  if (!are_hostnames_equal (hb_Cluster->host_name, hbp_header->dest_host_name))
1647  {
1648  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "hostname mismatch. " "(host_name:{%s}, dest_host_name:{%s}).\n",
1649  hb_Cluster->host_name, hbp_header->dest_host_name);
1650  pthread_mutex_unlock (&hb_Cluster->lock);
1651  return;
1652  }
1653 
1654  if (len != (int) (sizeof (*hbp_header) + htons (hbp_header->len)))
1655  {
1656  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "size mismatch. " "(len:%d, msg_size:%d).\n", len,
1657  (sizeof (*hbp_header) + htons (hbp_header->len)));
1658  pthread_mutex_unlock (&hb_Cluster->lock);
1659  return;
1660  }
1661 
1662 #if 0
1664  "hbp_header. (type:%d, r:%d, len:%d, seq:%d, " "orig_host_name:{%s}, dest_host_name:{%s}). \n",
1665  hbp_header->type, (hbp_header->r) ? 1 : 0, ntohs (hbp_header->len), ntohl (hbp_header->seq),
1666  hbp_header->orig_host_name, hbp_header->dest_host_name);
1667 #endif
1668 
1669  switch (hbp_header->type)
1670  {
1671  case HBP_CLUSTER_HEARTBEAT:
1672  {
1673  HB_NODE_STATE_TYPE hb_state;
1674 
1675  p = (char *) (hbp_header + 1);
1676  or_unpack_int (p, &state);
1677 
1678  hb_state = (HB_NODE_STATE_TYPE) state;
1679 
1680  if (hb_state < HB_NSTATE_UNKNOWN || hb_state >= HB_NSTATE_MAX)
1681  {
1682  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "receive heartbeat have unknown state. " "(state:%u).\n", state);
1683  pthread_mutex_unlock (&hb_Cluster->lock);
1684  return;
1685  }
1686 
1687  rv = hb_is_heartbeat_valid (hbp_header->orig_host_name, hbp_header->group_id, from);
1688  if (rv != HB_VALID_NO_ERROR)
1689  {
1690  ui_node = hb_return_ui_node (hbp_header->orig_host_name, hbp_header->group_id, *from);
1691 
1692  if (ui_node && ui_node->v_result != rv)
1693  {
1694  hb_remove_ui_node (ui_node);
1695  ui_node = NULL;
1696  }
1697 
1698  if (ui_node == NULL)
1699  {
1700  char *ipv4_p;
1701 
1702  ipv4_p = (char *) &from->sin_addr.s_addr;
1703  snprintf (error_string, sizeof (error_string),
1704  "Receive heartbeat from unidentified host. " "(host_name:'%s', group:'%s', "
1705  "ip_addr:'%u.%u.%u.%u', state:'%s')", hbp_header->orig_host_name, hbp_header->group_id,
1706  (unsigned char) (ipv4_p[0]), (unsigned char) (ipv4_p[1]), (unsigned char) (ipv4_p[2]),
1707  (unsigned char) (ipv4_p[3]), hb_valid_result_string (rv));
1709 
1710  (void) hb_add_ui_node (hbp_header->orig_host_name, hbp_header->group_id, *from, rv);
1711  }
1712  else
1713  {
1714  gettimeofday (&ui_node->last_recv_time, NULL);
1715  }
1716  }
1717 
1718  /*
1719  * if heartbeat group id is mismatch, ignore heartbeat
1720  */
1721  if (strcmp (hbp_header->group_id, hb_Cluster->group_id))
1722  {
1723  pthread_mutex_unlock (&hb_Cluster->lock);
1724  return;
1725  }
1726 
1727  /*
1728  * must send heartbeat response in order to avoid split-brain
1729  * when heartbeat configuration changed
1730  */
1731  if (hbp_header->r && hb_Cluster->hide_to_demote == false)
1732  {
1733  hb_cluster_send_heartbeat_resp (from, from_len, hbp_header->orig_host_name);
1734  }
1735 
1736  node = hb_return_node_by_name_except_me (hbp_header->orig_host_name);
1737  if (node)
1738  {
1739  if (node->state == HB_NSTATE_MASTER && node->state != hb_state)
1740  {
1741  is_state_changed = true;
1742  }
1743 
1744  node->state = hb_state;
1745  node->heartbeat_gap = MAX (0, (node->heartbeat_gap - 1));
1746  gettimeofday (&node->last_recv_hbtime, NULL);
1747  }
1748  else
1749  {
1750  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "receive heartbeat have unknown host_name. " "(host_name:{%s}).\n",
1751  hbp_header->orig_host_name);
1752  }
1753  }
1754  break;
1755  default:
1756  {
1757  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "unknown heartbeat message. " "(type:%d). \n", hbp_header->type);
1758  }
1759  break;
1760 
1761  }
1762 
1763  pthread_mutex_unlock (&hb_Cluster->lock);
1764 
1765  if (is_state_changed == true)
1766  {
1767  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "peer node state has been changed.");
1769  }
1770 
1771  return;
1772 }
1773 
1774 /*
1775  * hb_set_net_header() -
1776  * return: none
1777  *
1778  * header(out):
1779  * type(in):
1780  * is_req(in):
1781  * len(in):
1782  * seq(in):
1783  * dest_host_name(in):
1784  */
1785 static int
1786 hb_set_net_header (HBP_HEADER * header, unsigned char type, bool is_req, unsigned short len, unsigned int seq,
1787  char *dest_host_name)
1788 {
1789  if (hb_Cluster->myself == NULL)
1790  {
1791  // if myself is NULL then cluster is not healthy
1792  return ER_FAILED;
1793  }
1794 
1795  header->type = type;
1796  header->r = (is_req) ? 1 : 0;
1797  header->len = htons (len);
1798  header->seq = htonl (seq);
1799  strncpy_bufsize (header->group_id, hb_Cluster->group_id);
1800  strncpy_bufsize (header->dest_host_name, dest_host_name);
1801  strncpy_bufsize (header->orig_host_name, hb_Cluster->myself->host_name);
1802 
1803  return NO_ERROR;
1804 }
1805 
1806 /*
1807  * hb_hostname_to_sin_addr() -
1808  * return:
1809  *
1810  * host(in):
1811  * addr(out):
1812  */
1813 static int
1814 hb_hostname_to_sin_addr (const char *host, struct in_addr *addr)
1815 {
1816  in_addr_t in_addr;
1817 
1818  /*
1819  * First try to convert the host name as a dotted-decimal number.
1820  * Only if that fails do we call gethostbyname.
1821  */
1822  in_addr = inet_addr (host);
1823  if (in_addr != INADDR_NONE)
1824  {
1825  memcpy ((void *) addr, (void *) &in_addr, sizeof (in_addr));
1826  }
1827  else
1828  {
1829 #ifdef HAVE_GETHOSTBYNAME_R
1830 #if defined (HAVE_GETHOSTBYNAME_R_GLIBC)
1831  struct hostent *hp, hent;
1832  int herr;
1833  char buf[1024];
1834 
1835  if (gethostbyname_r (host, &hent, buf, sizeof (buf), &hp, &herr) != 0 || hp == NULL)
1836  {
1839  }
1840  memcpy ((void *) addr, (void *) hent.h_addr, hent.h_length);
1841 #elif defined (HAVE_GETHOSTBYNAME_R_SOLARIS)
1842  struct hostent hent;
1843  int herr;
1844  char buf[1024];
1845 
1846  if (gethostbyname_r (host, &hent, buf, sizeof (buf), &herr) == NULL)
1847  {
1850  }
1851  memcpy ((void *) addr, (void *) hent.h_addr, hent.h_length);
1852 #elif defined (HAVE_GETHOSTBYNAME_R_HOSTENT_DATA)
1853  struct hostent hent;
1854  struct hostent_data ht_data;
1855 
1856  if (gethostbyname_r (host, &hent, &ht_data) == -1)
1857  {
1860  }
1861  memcpy ((void *) addr, (void *) hent.h_addr, hent.h_length);
1862 #else
1863 #error "HAVE_GETHOSTBYNAME_R"
1864 #endif
1865 #else /* HAVE_GETHOSTBYNAME_R */
1866  struct hostent *hp;
1867  int r;
1868 
1870  hp = gethostbyname (host);
1871  if (hp == NULL)
1872  {
1876  }
1877  memcpy ((void *) addr, (void *) hp->h_addr, hp->h_length);
1879 #endif /* !HAVE_GETHOSTBYNAME_R */
1880  }
1881 
1882  return NO_ERROR;
1883 }
1884 
1885 
1886 /*
1887  * hb_hostname_n_port_to_sockaddr() -
1888  * return: NO_ERROR
1889  *
1890  * host(in):
1891  * port(in):
1892  * saddr(out):
1893  * slen(out):
1894  */
1895 static int
1896 hb_hostname_n_port_to_sockaddr (const char *host, int port, struct sockaddr *saddr, socklen_t * slen)
1897 {
1898  int error = NO_ERROR;
1899  struct sockaddr_in udp_saddr;
1900 
1901  /*
1902  * Construct address for UDP socket
1903  */
1904  memset ((void *) &udp_saddr, 0, sizeof (udp_saddr));
1905  udp_saddr.sin_family = AF_INET;
1906  udp_saddr.sin_port = htons (port);
1907 
1908  error = hb_hostname_to_sin_addr (host, &udp_saddr.sin_addr);
1909  if (error != NO_ERROR)
1910  {
1911  return INVALID_SOCKET;
1912  }
1913 
1914  *slen = sizeof (udp_saddr);
1915  memcpy ((void *) saddr, (void *) &udp_saddr, *slen);
1916 
1917  return NO_ERROR;
1918 }
1919 
1920 /*
1921  * cluster job queue
1922  */
1923 
1924 /*
1925  * hb_cluster_job_dequeue() -
1926  * return: pointer to cluster job entry
1927  */
1928 static HB_JOB_ENTRY *
1930 {
1931  return hb_job_dequeue (cluster_Jobs);
1932 }
1933 
1934 /*
1935  * hb_cluster_job_queue() -
1936  * return: NO_ERROR or ER_FAILED
1937  *
1938  * job_type(in):
1939  * arg(in):
1940  * msec(in):
1941  */
1942 static int
1943 hb_cluster_job_queue (unsigned int job_type, HB_JOB_ARG * arg, unsigned int msec)
1944 {
1945  if (job_type >= HB_CJOB_MAX)
1946  {
1947  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "unknown job type. (job_type:%d).\n", job_type);
1948  return ER_FAILED;
1949  }
1950 
1951  return hb_job_queue (cluster_Jobs, job_type, arg, msec);
1952 }
1953 
1954 /*
1955  * hb_cluster_job_set_expire_and_reorder() -
1956  * return: NO_ERROR or ER_FAILED
1957  *
1958  * job_type(in):
1959  * msec(in):
1960  */
1961 static int
1962 hb_cluster_job_set_expire_and_reorder (unsigned int job_type, unsigned int msec)
1963 {
1964  if (job_type >= HB_CJOB_MAX)
1965  {
1966  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "unknown job type. (job_type:%d).\n", job_type);
1967  return ER_FAILED;
1968  }
1969 
1970  hb_job_set_expire_and_reorder (cluster_Jobs, job_type, msec);
1971 
1972  return NO_ERROR;
1973 }
1974 
1975 /*
1976  * hb_cluster_job_shutdown() -
1977  * return: pointer to cluster job entry
1978  */
1979 static void
1981 {
1982  return hb_job_shutdown (cluster_Jobs);
1983 }
1984 
1985 
1986 /*
1987  * cluster node
1988  */
1989 
1990 /*
1991  * hb_add_node_to_cluster() -
1992  * return: pointer to heartbeat node entry
1993  *
1994  * host_name(in):
1995  * priority(in):
1996  */
1997 static HB_NODE_ENTRY *
1998 hb_add_node_to_cluster (char *host_name, unsigned short priority)
1999 {
2000  HB_NODE_ENTRY *p;
2001  HB_NODE_ENTRY **first_pp;
2002 
2003  if (host_name == NULL)
2004  {
2005  return NULL;
2006  }
2007 
2008  p = (HB_NODE_ENTRY *) malloc (sizeof (HB_NODE_ENTRY));
2009  if (p)
2010  {
2011  if (are_hostnames_equal (host_name, "localhost"))
2012  {
2013  strncpy (p->host_name, hb_Cluster->host_name, sizeof (p->host_name) - 1);
2014  }
2015  else
2016  {
2017  strncpy (p->host_name, host_name, sizeof (p->host_name) - 1);
2018  }
2019  p->host_name[sizeof (p->host_name) - 1] = '\0';
2020  p->priority = priority;
2021  p->state = HB_NSTATE_UNKNOWN;
2022  p->score = 0;
2023  p->heartbeat_gap = 0;
2024  p->last_recv_hbtime.tv_sec = 0;
2025  p->last_recv_hbtime.tv_usec = 0;
2026 
2027  p->next = NULL;
2028  p->prev = NULL;
2029  first_pp = &hb_Cluster->nodes;
2030  hb_list_add ((HB_LIST **) first_pp, (HB_LIST *) p);
2031  }
2032 
2033  return (p);
2034 }
2035 
2036 /*
2037  * hb_remove_node() -
2038  * return: none
2039  *
2040  * entry_p(in):
2041  */
2042 static void
2044 {
2045  if (entry_p)
2046  {
2047  hb_list_remove ((HB_LIST *) entry_p);
2048  free_and_init (entry_p);
2049  }
2050  return;
2051 }
2052 
2053 /*
2054  * hb_cluster_remove_all_nodes() -
2055  * return: none
2056  *
2057  * first(in):
2058  */
2059 static void
2061 {
2062  HB_NODE_ENTRY *node, *next_node;
2063 
2064  for (node = first; node; node = next_node)
2065  {
2066  next_node = node->next;
2067  hb_remove_node (node);
2068  }
2069 }
2070 
2071 /*
2072  * hb_add_ping_host() -
2073  * return: pointer to ping host entry
2074  *
2075  * host_name(in):
2076  */
2077 static HB_PING_HOST_ENTRY *
2078 hb_add_ping_host (char *host_name)
2079 {
2081  HB_PING_HOST_ENTRY **first_pp;
2082 
2083  if (host_name == NULL)
2084  {
2085  return NULL;
2086  }
2087 
2088  p = (HB_PING_HOST_ENTRY *) malloc (sizeof (HB_PING_HOST_ENTRY));
2089  if (p)
2090  {
2091  strncpy (p->host_name, host_name, sizeof (p->host_name) - 1);
2092  p->host_name[sizeof (p->host_name) - 1] = '\0';
2094  p->next = NULL;
2095  p->prev = NULL;
2096 
2097  first_pp = &hb_Cluster->ping_hosts;
2098 
2099  hb_list_add ((HB_LIST **) first_pp, (HB_LIST *) p);
2100  }
2101 
2102  return (p);
2103 }
2104 
2105 /*
2106  * hb_remove_ping_host() -
2107  * return: none
2108  *
2109  * entry_p(in):
2110  */
2111 static void
2113 {
2114  if (entry_p)
2115  {
2116  hb_list_remove ((HB_LIST *) entry_p);
2117  free_and_init (entry_p);
2118  }
2119  return;
2120 }
2121 
2122 /*
2123  * hb_cluster_remove_all_ping_hosts() -
2124  * return: none
2125  *
2126  * first(in):
2127  */
2128 static void
2130 {
2131  HB_PING_HOST_ENTRY *host, *next_host;
2132 
2133  for (host = first; host; host = next_host)
2134  {
2135  next_host = host->next;
2136  hb_remove_ping_host (host);
2137  }
2138 }
2139 
2140 /*
2141  * hb_cluster_load_ping_host_list() -
2142  * return: number of ping hosts
2143  *
2144  * host_list(in):
2145  */
2146 static int
2147 hb_cluster_load_ping_host_list (char *ha_ping_host_list)
2148 {
2149  int num_hosts = 0;
2150  char host_list[LINE_MAX];
2151  char *host_list_p, *host_p, *host_pp;
2152 
2153  if (ha_ping_host_list == NULL)
2154  {
2155  return 0;
2156  }
2157 
2158  strncpy_bufsize (host_list, ha_ping_host_list);
2159 
2160  for (host_list_p = host_list;; host_list_p = NULL)
2161  {
2162  host_p = strtok_r (host_list_p, " ,:", &host_pp);
2163  if (host_p == NULL)
2164  {
2165  break;
2166  }
2167 
2168  hb_add_ping_host (host_p);
2169  num_hosts++;
2170  }
2171 
2172  return num_hosts;
2173 }
2174 
2175 /*
2176  * hb_return_node_by_name() -
2177  * return: pointer to heartbeat node entry
2178  *
2179  * name(in):
2180  */
2181 static HB_NODE_ENTRY *
2183 {
2184  HB_NODE_ENTRY *node;
2185 
2186  for (node = hb_Cluster->nodes; node; node = node->next)
2187  {
2188  if (!are_hostnames_equal (name, node->host_name))
2189  {
2190  continue;
2191  }
2192 
2193  return (node);
2194  }
2195 
2196  return NULL;
2197 }
2198 
2199 /*
2200  * hb_return_node_by_name_except_me() -
2201  * return: pointer to heartbeat node entry
2202  *
2203  * name(in):
2204  */
2205 static HB_NODE_ENTRY *
2207 {
2208  HB_NODE_ENTRY *node;
2209 
2210  for (node = hb_Cluster->nodes; node; node = node->next)
2211  {
2212  if (!are_hostnames_equal (name, node->host_name) || are_hostnames_equal (name, hb_Cluster->host_name))
2213  {
2214  continue;
2215  }
2216 
2217  return (node);
2218  }
2219 
2220  return NULL;
2221 }
2222 
2223 static int
2224 hb_is_heartbeat_valid (char *host_name, char *group_id, struct sockaddr_in *from)
2225 {
2226  int error;
2227  struct in_addr sin_addr;
2228  HB_NODE_ENTRY *node;
2229 
2230  node = hb_return_node_by_name_except_me (host_name);
2231  if (node == NULL)
2232  {
2234  }
2235 
2236  if (strcmp (group_id, hb_Cluster->group_id) != 0)
2237  {
2239  }
2240 
2241  error = hb_hostname_to_sin_addr (host_name, &sin_addr);
2242  if (error == NO_ERROR)
2243  {
2244  if (memcmp ((void *) &sin_addr, (void *) &from->sin_addr, sizeof (struct in_addr)) != 0)
2245  {
2247  }
2248  }
2249  else
2250  {
2252  }
2253 
2254  return HB_VALID_NO_ERROR;
2255 }
2256 
2257 /*
2258  * hb_valid_result_string() -
2259  */
2260 static const char *
2262 {
2263  switch (v_result)
2264  {
2265  case HB_VALID_NO_ERROR:
2266  return HB_VALID_NO_ERROR_STR;
2275  }
2276 
2277  assert (false);
2278  return "invalid";
2279 }
2280 
2281 /*
2282  * hb_return_ui_node() -
2283  * return: unidentified node pointer
2284  */
2285 static HB_UI_NODE_ENTRY *
2286 hb_return_ui_node (char *host_name, char *group_id, struct sockaddr_in saddr)
2287 {
2288  HB_UI_NODE_ENTRY *node = NULL;
2289 
2290  for (node = hb_Cluster->ui_nodes; node; node = node->next)
2291  {
2292  if (!are_hostnames_equal (node->host_name, host_name))
2293  {
2294  continue;
2295  }
2296 
2297  if (strcmp (node->group_id, group_id) != 0)
2298  {
2299  continue;
2300  }
2301 
2302  if (node->saddr.sin_addr.s_addr != saddr.sin_addr.s_addr)
2303  {
2304  continue;
2305  }
2306 
2307  break;
2308  }
2309 
2310  return node;
2311 }
2312 
2313 /*
2314  * hb_add_ui_node() -
2315  * return: added node pointer
2316  */
2317 static HB_UI_NODE_ENTRY *
2318 hb_add_ui_node (char *host_name, char *group_id, struct sockaddr_in saddr, int v_result)
2319 {
2320  HB_UI_NODE_ENTRY *node = NULL;
2321 
2323  || v_result == HB_VALID_IP_ADDR_MISMATCH || v_result == HB_VALID_CANNOT_RESOLVE_HOST);
2324 
2325  node = hb_return_ui_node (host_name, group_id, saddr);
2326  if (node)
2327  {
2328  return node;
2329  }
2330 
2331  node = (HB_UI_NODE_ENTRY *) malloc (sizeof (HB_UI_NODE_ENTRY));
2332  if (node)
2333  {
2334  strncpy_bufsize (node->host_name, host_name);
2335  strncpy_bufsize (node->group_id, group_id);
2336  memcpy ((void *) &node->saddr, (void *) &saddr, sizeof (struct sockaddr_in));
2337  gettimeofday (&node->last_recv_time, NULL);
2338  node->v_result = v_result;
2339 
2340  node->next = NULL;
2341  node->prev = NULL;
2342 
2343  hb_list_add ((HB_LIST **) (&hb_Cluster->ui_nodes), (HB_LIST *) node);
2344  hb_Cluster->num_ui_nodes++;
2345  }
2346 
2347  return node;
2348 }
2349 
2350 /*
2351  * hb_remove_ui_node() -
2352  * return: none
2353  */
2354 static void
2356 {
2357  if (node)
2358  {
2359  hb_list_remove ((HB_LIST *) node);
2360  free_and_init (node);
2361  hb_Cluster->num_ui_nodes--;
2362  if (hb_Cluster->num_ui_nodes < 0)
2363  {
2364  assert (0);
2365  hb_Cluster->num_ui_nodes = 0;
2366  }
2367  }
2368 }
2369 
2370 /*
2371  * hb_cleanup_ui_nodes() -
2372  * return: none
2373  */
2374 static void
2376 {
2377  HB_UI_NODE_ENTRY *node, *node_next;
2378  struct timeval now;
2379 
2380  gettimeofday (&now, NULL);
2381 
2382  for (node = first; node; node = node_next)
2383  {
2384  node_next = node->next;
2386  {
2387  hb_remove_ui_node (node);
2388  }
2389  node = NULL;
2390  }
2391 
2392  return;
2393 }
2394 
2395 /*
2396  * hb_cluster_remove_all_ui_nodes() -
2397  * return: none
2398  */
2399 static void
2401 {
2402  HB_UI_NODE_ENTRY *node, *node_next;
2403 
2404  for (node = first; node; node = node_next)
2405  {
2406  node_next = node->next;
2407  hb_remove_ui_node (node);
2408  node = NULL;
2409  }
2410 }
2411 
2412 /*
2413  * hb_cluster_load_group_and_node_list() -
2414  * return: number of cluster nodes
2415  *
2416  * host_list(in):
2417  */
2418 static int
2419 hb_cluster_load_group_and_node_list (char *ha_node_list, char *ha_replica_list)
2420 {
2421  int priority, num_nodes;
2422  char tmp_string[LINE_MAX];
2423  char *p, *savep;
2424  HB_NODE_ENTRY *node;
2425 
2426  if (ha_node_list == NULL)
2427  {
2428  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "invalid ha_node_list. (ha_node_list:NULL).\n");
2429  return ER_FAILED;
2430  }
2431 
2432  hb_Cluster->myself = NULL;
2433 
2434  strncpy_bufsize (tmp_string, ha_node_list);
2435  for (priority = 0, p = strtok_r (tmp_string, "@", &savep); p; priority++, p = strtok_r (NULL, " ,:", &savep))
2436  {
2437 
2438  if (priority == 0)
2439  {
2440  /* TODO : trim group id */
2441  /* set heartbeat group id */
2442  strncpy_bufsize (hb_Cluster->group_id, p);
2443  }
2444  else
2445  {
2446  /* TODO : trim node name */
2447  node = hb_add_node_to_cluster (p, (priority));
2448  if (node)
2449  {
2450  if (are_hostnames_equal (node->host_name, hb_Cluster->host_name))
2451  {
2452  hb_Cluster->myself = node;
2453 #if defined (HB_VERBOSE_DEBUG)
2454  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "find myself node. (myself:%p, priority:%d). \n",
2455  hb_Cluster->myself, hb_Cluster->myself->priority);
2456 #endif
2457  }
2458  }
2459  }
2460  }
2461 
2462  if (hb_Cluster->state == HB_NSTATE_REPLICA && hb_Cluster->myself != NULL)
2463  {
2464  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "myself should be in the ha_replica_list. \n");
2465  return ER_FAILED;
2466  }
2467  num_nodes = priority;
2468 
2469  if (ha_replica_list)
2470  {
2471  strncpy_bufsize (tmp_string, ha_replica_list);
2472  }
2473  else
2474  {
2475  tmp_string[0] = '\0';
2476  }
2477  for (priority = 0, p = strtok_r (tmp_string, "@", &savep); p; priority++, p = strtok_r (NULL, " ,:", &savep))
2478  {
2479 
2480  if (priority == 0)
2481  {
2482  if (strcmp (hb_Cluster->group_id, p) != 0)
2483  {
2484  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "different group id ('ha_node_list', 'ha_replica_list') \n");
2485  return ER_FAILED;
2486  }
2487  }
2488  else
2489  {
2491  if (node)
2492  {
2493  if (are_hostnames_equal (node->host_name, hb_Cluster->host_name))
2494  {
2495  hb_Cluster->myself = node;
2496  hb_Cluster->state = HB_NSTATE_REPLICA;
2497  }
2498  }
2499  }
2500  }
2501 
2502  if (hb_Cluster->myself == NULL)
2503  {
2504  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "cannot find myself. \n");
2505  return ER_FAILED;
2506  }
2507 
2508  return num_nodes + priority;
2509 }
2510 
2511 
2512 
2513 /*
2514  * resource process job actions
2515  */
2516 
2517 /*
2518  * hb_resource_job_confirm_cleanup_all () - confirm that all HA processes are shutdown
2519  * for deactivating heartbeat
2520  * return: none
2521  *
2522  * arg(in):
2523  */
2524 static void
2526 {
2527  int rv, error;
2528  HB_RESOURCE_JOB_ARG *resource_job_arg;
2529  HB_PROC_ENTRY *proc, *proc_next;
2530  char error_string[LINE_MAX] = "";
2531  int num_connected_rsc = 0;
2532 
2533  resource_job_arg = (arg) ? &(arg->resource_job_arg) : NULL;
2534 
2535  if (arg == NULL || resource_job_arg == NULL)
2536  {
2537  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "invalid arg or resource_job_arg. (arg:%p, resource_job_arg:%p). \n", arg,
2538  resource_job_arg);
2539  return;
2540  }
2541 
2542  rv = pthread_mutex_lock (&hb_Resource->lock);
2543 
2544  if (++(resource_job_arg->retries) > resource_job_arg->max_retries || hb_Deactivate_immediately == true)
2545  {
2546  for (proc = hb_Resource->procs; proc; proc = proc_next)
2547  {
2548  assert (proc->state == HB_PSTATE_DEREGISTERED);
2549  assert (proc->pid > 0);
2550 
2551  proc_next = proc->next;
2552 
2553  if (proc->pid > 0 && (kill (proc->pid, 0) == 0 || errno != ESRCH))
2554  {
2555  snprintf (error_string, LINE_MAX, "(pid: %d, args:%s)", proc->pid, proc->args);
2556  if (hb_Deactivate_immediately == true)
2557  {
2559  "Immediate shutdown requested. Process killed", error_string);
2560  }
2561  else
2562  {
2564  "No response to shutdown request. Process killed", error_string);
2565  }
2566 
2567  kill (proc->pid, SIGKILL);
2568  }
2569 
2570  hb_Resource->num_procs--;
2571  hb_remove_proc (proc);
2572  proc = NULL;
2573  }
2574 
2575  assert (hb_Resource->num_procs == 0);
2576  goto end_confirm_cleanup;
2577  }
2578 
2579  for (proc = hb_Resource->procs; proc; proc = proc_next)
2580  {
2581  assert (proc->state == HB_PSTATE_DEREGISTERED);
2582  assert (proc->pid > 0);
2583 
2584  proc_next = proc->next;
2585 
2586  if (proc->type != HB_PTYPE_SERVER)
2587  {
2588  if (proc->pid > 0 && (kill (proc->pid, 0) == 0 || errno != ESRCH))
2589  {
2590  kill (proc->pid, SIGKILL);
2591 
2592  snprintf (error_string, LINE_MAX, "(pid: %d, args:%s)", proc->pid, proc->args);
2594  "No response to shutdown request. Process killed", error_string);
2595  }
2596  hb_Resource->num_procs--;
2597  hb_remove_proc (proc);
2598  proc = NULL;
2599  }
2600  else
2601  {
2602  if (proc->pid <= 0 || (kill (proc->pid, 0) && errno == ESRCH))
2603  {
2604  hb_Resource->num_procs--;
2605  hb_remove_proc (proc);
2606  proc = NULL;
2607  continue;
2608  }
2609  }
2610 
2611  if (proc && proc->conn != NULL)
2612  {
2613  num_connected_rsc++;
2614  }
2615 
2616  assert (hb_Resource->num_procs >= 0);
2617  }
2618 
2619  if (hb_Resource->num_procs == 0 || num_connected_rsc == 0)
2620  {
2621  goto end_confirm_cleanup;
2622  }
2623 
2624  pthread_mutex_unlock (&hb_Resource->lock);
2625 
2626  error =
2629 
2630  if (error != NO_ERROR)
2631  {
2632  assert (false);
2633  free_and_init (arg);
2634  }
2635 
2636  return;
2637 
2638 end_confirm_cleanup:
2639  pthread_mutex_unlock (&hb_Resource->lock);
2640 
2641  if (arg != NULL)
2642  {
2643  free_and_init (arg);
2644  }
2645 
2646  MASTER_ER_SET (ER_NOTIFICATION_SEVERITY, ARG_FILE_LINE, ER_HB_PROCESS_EVENT, 2, "ready to deactivate heartbeat", "");
2647  return;
2648 }
2649 
2650 /*
2651  * hb_resource_job_cleanup_all () - shutdown all HA processes including cub_server
2652  * for deactivating heartbeat
2653  * return: none
2654  *
2655  * arg(in):
2656  */
2657 static void
2659 {
2660  int rv, i, error;
2661  HB_PROC_ENTRY *proc;
2662  HB_JOB_ARG *job_arg;
2663  HB_RESOURCE_JOB_ARG *resource_job_arg;
2664 
2666  rv = pthread_mutex_lock (&hb_Resource->lock);
2667 
2668  if (hb_Deactivate_immediately == false)
2669  {
2670  /* register CUBRID server pid */
2671  hb_Deactivate_info.server_pid_list = (int *) calloc (hb_Resource->num_procs, sizeof (int));
2672 
2673  for (i = 0, proc = hb_Resource->procs; proc; proc = proc->next)
2674  {
2675  if (proc->conn && proc->type == HB_PTYPE_SERVER)
2676  {
2677  hb_Deactivate_info.server_pid_list[i] = proc->pid;
2678  i++;
2679  }
2680  }
2681 
2682  hb_Deactivate_info.server_count = i;
2683 
2684  assert (hb_Resource->num_procs >= i);
2685  }
2686 
2688 
2689  job_arg = (HB_JOB_ARG *) malloc (sizeof (HB_JOB_ARG));
2690  if (job_arg == NULL)
2691  {
2692  pthread_mutex_unlock (&hb_Resource->lock);
2694 
2696 
2697  return;
2698  }
2699 
2700  resource_job_arg = &(job_arg->resource_job_arg);
2701  resource_job_arg->retries = 0;
2703  gettimeofday (&resource_job_arg->ftime, NULL);
2704 
2705  pthread_mutex_unlock (&hb_Resource->lock);
2707 
2709 
2710  if (error != NO_ERROR)
2711  {
2712  assert (false);
2713  free_and_init (job_arg);
2714  }
2715 
2716  return;
2717 }
2718 
2719 /*
2720  * hb_resource_job_proc_start () -
2721  * return: none
2722  *
2723  * arg(in):
2724  */
2725 static void
2727 {
2728  int error, rv;
2729  char error_string[LINE_MAX] = "";
2730  pid_t pid;
2731  struct timeval now;
2732  HB_PROC_ENTRY *proc;
2733  HB_RESOURCE_JOB_ARG *proc_arg = (arg) ? &(arg->resource_job_arg) : NULL;
2734  char *argv[HB_MAX_NUM_PROC_ARGV] = { NULL, }, *args;
2735 
2736  if (arg == NULL || proc_arg == NULL)
2737  {
2738  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "invalid arg or proc_arg. (arg:%p, proc_arg:%p). \n", arg, proc_arg);
2739  return;
2740  }
2741 
2742  rv = pthread_mutex_lock (&hb_Resource->lock);
2743  proc = hb_return_proc_by_args (proc_arg->args);
2744  if (proc == NULL || proc->state == HB_PSTATE_DEREGISTERED)
2745  {
2746  pthread_mutex_unlock (&hb_Resource->lock);
2747  free_and_init (arg);
2748  return;
2749  }
2750 
2751  if (proc->being_shutdown)
2752  {
2753  assert (proc_arg->pid > 0);
2754  if (proc_arg->pid <= 0 || (kill (proc_arg->pid, 0) && errno == ESRCH))
2755  {
2756  proc->being_shutdown = false;
2757  }
2758  else
2759  {
2760  pthread_mutex_unlock (&hb_Resource->lock);
2762  if (error != NO_ERROR)
2763  {
2764  assert (false);
2765  free_and_init (arg);
2766  }
2767  return;
2768  }
2769  }
2770 
2771  gettimeofday (&now, NULL);
2772  if (HB_GET_ELAPSED_TIME (now, proc->frtime) < HB_PROC_RECOVERY_DELAY_TIME)
2773  {
2774  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "delay the restart of the process. (arg:%p, proc_arg:%p). \n", arg, proc_arg);
2775 
2776  pthread_mutex_unlock (&hb_Resource->lock);
2778  if (error != NO_ERROR)
2779  {
2780  assert (false);
2781  free_and_init (arg);
2782  }
2783  return;
2784  }
2785 
2786  snprintf (error_string, LINE_MAX, "(args:%s)", proc->args);
2787  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_HB_PROCESS_EVENT, 2, "Restart the process", error_string);
2788 
2789  args = strdup (proc->args);
2790  hb_proc_make_arg (argv, args);
2791 
2792  pid = fork ();
2793  if (pid < 0)
2794  {
2795  pthread_mutex_unlock (&hb_Resource->lock);
2796 
2798 
2800  if (error != NO_ERROR)
2801  {
2802  assert (false);
2803  free_and_init (arg);
2804  }
2805 
2806  free (args);
2807 
2808  return;
2809  }
2810  else if (pid == 0)
2811  {
2812 #if defined (HB_VERBOSE_DEBUG)
2814  "execute:{%s} arg[0]:{%s} arg[1]:{%s} arg[2]:{%s} "
2815  "arg[3]:{%s} arg{4}:{%s} arg[5]:{%s} arg[6]:{%s} " "arg[7]:{%s} arg[8]:{%s} arg[9]:{%s}.\n",
2816  proc->exec_path, (argv[0]) ? argv[0] : "", (argv[1]) ? argv[1] : "",
2817  (argv[2]) ? argv[2] : "", (argv[3]) ? argv[3] : "", (argv[4]) ? argv[4] : "",
2818  (argv[5]) ? argv[5] : "", (argv[6]) ? argv[6] : "", (argv[7]) ? argv[7] : "",
2819  (argv[8]) ? argv[8] : "", (argv[9]) ? argv[9] : "");
2820 #endif
2821  error = execv (proc->exec_path, argv);
2822  pthread_mutex_unlock (&hb_Resource->lock);
2823 
2824  free_and_init (arg);
2825  css_master_cleanup (SIGTERM);
2826  return;
2827  }
2828  else
2829  {
2830  proc->pid = pid;
2831  proc->state = HB_PSTATE_STARTED;
2832  gettimeofday (&proc->stime, NULL);
2833 
2834  free (args);
2835  }
2836 
2837  pthread_mutex_unlock (&hb_Resource->lock);
2838 
2839  error =
2842  if (error != NO_ERROR)
2843  {
2844  assert (false);
2845  free_and_init (arg);
2846  }
2847 
2848  return;
2849 }
2850 
2851 /*
2852  * hb_resource_job_proc_dereg() -
2853  * return: none
2854  *
2855  * arg(in):
2856  */
2857 static void
2859 {
2860  int error, rv;
2861  HB_PROC_ENTRY *proc;
2862  HB_RESOURCE_JOB_ARG *proc_arg = (arg) ? &(arg->resource_job_arg) : NULL;
2863  SOCKET_QUEUE_ENTRY *sock_entq;
2864  char buffer[MASTER_TO_SRV_MSG_SIZE];
2865 
2866  if (arg == NULL || proc_arg == NULL)
2867  {
2868  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "invalid arg or proc_arg. (arg:%p, proc_arg:%p). \n", arg, proc_arg);
2869  return;
2870  }
2871 #if defined (HB_VERBOSE_DEBUG)
2872  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "deregister process. (pid:%d). \n", proc_arg->pid);
2873 #endif
2874 #if !defined(WINDOWS)
2876 #endif
2877  rv = pthread_mutex_lock (&hb_Resource->lock);
2878  proc = hb_return_proc_by_pid (proc_arg->pid);
2879  if (proc == NULL)
2880  {
2881  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "cannot find process entry. (unknown pid, pid:%d). \n", proc_arg->pid);
2882  pthread_mutex_unlock (&hb_Resource->lock);
2883 #if !defined(WINDOWS)
2885 #endif
2886 
2887  free_and_init (arg);
2888  return;
2889  }
2890 
2891  if (proc->state != HB_PSTATE_DEREGISTERED)
2892  {
2893  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "invalid process state. (pid:%d, state:%d). \n", proc_arg->pid, proc->state);
2894  pthread_mutex_unlock (&hb_Resource->lock);
2895 #if !defined(WINDOWS)
2897 #endif
2898 
2899  free_and_init (arg);
2900  return;
2901  }
2902 
2903  if (proc->type == HB_PTYPE_SERVER)
2904  {
2906  assert_release (sock_entq == NULL || sock_entq->name != NULL);
2907  if (sock_entq == NULL || sock_entq->name == NULL)
2908  {
2909  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "invalid process conn entry. (pid:%d). \n", proc_arg->pid);
2910  goto hb_resource_job_proc_dereg_end;
2911 
2912  }
2913  memset (buffer, 0, sizeof (buffer));
2914  snprintf (buffer, sizeof (buffer) - 1,
2916  sock_entq->name + 1, 0);
2917  css_process_start_shutdown (sock_entq, 0, buffer);
2918  }
2919  else
2920  {
2921  assert (proc->pid > 0);
2922  if (proc->pid <= 0 || (kill (proc->pid, SIGTERM) && errno == ESRCH))
2923  {
2924  hb_Resource->num_procs--;
2925  hb_remove_proc (proc);
2926  proc = NULL;
2927 
2928  pthread_mutex_unlock (&hb_Resource->lock);
2929 #if !defined(WINDOWS)
2931 #endif
2932  free_and_init (arg);
2933  return;
2934  }
2935  }
2936 
2937 hb_resource_job_proc_dereg_end:
2938  pthread_mutex_unlock (&hb_Resource->lock);
2939 #if !defined(WINDOWS)
2941 #endif
2942 
2943  error =
2946  if (error != NO_ERROR)
2947  {
2948  assert (false);
2949  free_and_init (arg);
2950  }
2951 
2952  return;
2953 }
2954 
2955 /*
2956  * hb_resource_demote_start_shutdown_server_proc() -
2957  * send shutdown request to server
2958  * return: none
2959  *
2960  */
2961 static void
2963 {
2964  HB_PROC_ENTRY *proc;
2965  SOCKET_QUEUE_ENTRY *sock_entq;
2966  char buffer[MASTER_TO_SRV_MSG_SIZE];
2967 
2968  for (proc = hb_Resource->procs; proc; proc = proc->next)
2969  {
2970  /* leave processes other than cub_server */
2972  {
2973  continue;
2974  }
2975  assert (proc->type == HB_PTYPE_SERVER);
2976 
2977  if (proc->server_hang)
2978  {
2979  /* terminate a hang server process immediately */
2980  assert (proc->pid > 0);
2981  if (proc->pid > 0 && (kill (proc->pid, 0) == 0 || errno != ESRCH))
2982  {
2983  kill (proc->pid, SIGKILL);
2984  }
2985  continue;
2986  }
2987 
2989  assert_release (sock_entq == NULL || sock_entq->name != NULL);
2990  if (sock_entq != NULL && sock_entq->name != NULL)
2991  {
2992  memset (buffer, 0, sizeof (buffer));
2993  snprintf (buffer, sizeof (buffer) - 1,
2995  sock_entq->name + 1, 0);
2996 
2997  css_process_start_shutdown (sock_entq, 0, buffer);
2998  proc->being_shutdown = true;
2999  }
3000  }
3001  return;
3002 }
3003 
3004 /*
3005  * hb_resource_demote_confirm_shutdown_server_proc() -
3006  * confirm that server process is shutdown
3007  * return: whether all active, to-be-active server proc's are shutdown
3008  *
3009  */
3010 static bool
3012 {
3013  HB_PROC_ENTRY *proc;
3014 
3015  for (proc = hb_Resource->procs; proc; proc = proc->next)
3016  {
3017  if (proc->server_hang)
3018  {
3019  /* don't wait for a hang server process that has already been terminated */
3020  continue;
3021  }
3022 
3024  {
3025  assert (proc->type == HB_PTYPE_SERVER);
3026  return false;
3027  }
3028  }
3029  return true;
3030 }
3031 
3032 /*
3033  * hb_resource_demote_kill_server_proc() -
3034  * kill server process in an active or to-be-active state
3035  * return: none
3036  *
3037  */
3038 static void
3040 {
3041  HB_PROC_ENTRY *proc;
3042  char error_string[LINE_MAX] = "";
3043 
3044  for (proc = hb_Resource->procs; proc; proc = proc->next)
3045  {
3047  {
3048  assert (proc->type == HB_PTYPE_SERVER);
3049  assert (proc->pid > 0);
3050  if (proc->pid > 0 && (kill (proc->pid, 0) == 0 || errno != ESRCH))
3051  {
3052  snprintf (error_string, LINE_MAX, "(pid: %d, args:%s)", proc->pid, proc->args);
3054  "No response to shutdown request. Process killed", error_string);
3055  kill (proc->pid, SIGKILL);
3056  }
3057  }
3058  }
3059 }
3060 
3061 /*
3062  * hb_resource_job_demote_confirm_shutdown() -
3063  * prepare for demoting master
3064  * it checks if every active server process is shutdown
3065  * if so, it assigns demote cluster job
3066  * return: none
3067  *
3068  * arg(in):
3069  */
3070 static void
3072 {
3073  int error, rv;
3074  HB_JOB_ARG *job_arg;
3075  HB_RESOURCE_JOB_ARG *proc_arg = (arg) ? &(arg->resource_job_arg) : NULL;
3076  HB_CLUSTER_JOB_ARG *clst_arg;
3077 
3078  if (arg == NULL || proc_arg == NULL)
3079  {
3080  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "invalid arg or proc_arg. (arg:%p, proc_arg:%p). \n", arg, proc_arg);
3081  return;
3082  }
3083 
3084  rv = pthread_mutex_lock (&hb_Resource->lock);
3085 
3086  if (++(proc_arg->retries) > proc_arg->max_retries)
3087  {
3089  goto demote_confirm_shutdown_end;
3090  }
3091 
3093  {
3094  pthread_mutex_unlock (&hb_Resource->lock);
3095 
3096  error =
3099 
3100  assert (error == NO_ERROR);
3101 
3102  return;
3103  }
3104 
3105 demote_confirm_shutdown_end:
3106  pthread_mutex_unlock (&hb_Resource->lock);
3107 
3108  job_arg = (HB_JOB_ARG *) malloc (sizeof (HB_JOB_ARG));
3109  if (job_arg == NULL)
3110  {
3112  if (arg)
3113  {
3114  free_and_init (arg);
3115  }
3116  css_master_cleanup (SIGTERM);
3117  return;
3118  }
3119 
3120  clst_arg = &(job_arg->cluster_job_arg);
3121  clst_arg->ping_check_count = 0;
3122  clst_arg->retries = 0;
3123 
3125 
3126  if (error != NO_ERROR)
3127  {
3128  assert (false);
3129  free_and_init (job_arg);
3130  }
3131 
3132  if (arg)
3133  {
3134  free_and_init (arg);
3135  }
3136 
3137  return;
3138 }
3139 
3140 /*
3141  * hb_resource_job_demote_start_shutdown() -
3142  * prepare for demoting master
3143  * it shuts down working active server processes
3144  * return: none
3145  *
3146  * arg(in):
3147  */
3148 static void
3150 {
3151  int error, rv;
3152  HB_JOB_ARG *job_arg;
3153  HB_RESOURCE_JOB_ARG *proc_arg;
3154 
3155 #if !defined(WINDOWS)
3157 #endif
3158  rv = pthread_mutex_lock (&hb_Resource->lock);
3159 
3161 
3162  rv = pthread_mutex_unlock (&hb_Resource->lock);
3163 #if !defined(WINDOWS)
3165 #endif
3166 
3167  job_arg = (HB_JOB_ARG *) malloc (sizeof (HB_JOB_ARG));
3168  if (job_arg == NULL)
3169  {
3171  if (arg)
3172  {
3173  free_and_init (arg);
3174  }
3175  css_master_cleanup (SIGTERM);
3176  return;
3177  }
3178 
3179  proc_arg = &(job_arg->resource_job_arg);
3180  proc_arg->retries = 0;
3182  gettimeofday (&proc_arg->ftime, NULL);
3183 
3184  error =
3187  if (error != NO_ERROR)
3188  {
3189  assert (false);
3190  free_and_init (job_arg);
3191  }
3192 
3193  if (arg)
3194  {
3195  free_and_init (arg);
3196  }
3197  return;
3198 }
3199 
3200 /*
3201  * hb_resource_job_confirm_start() -
3202  * return: none
3203  *
3204  * arg(in):
3205  */
3206 static void
3208 {
3209  int error, rv;
3210  char error_string[LINE_MAX] = "";
3211  bool retry = true;
3212  HB_PROC_ENTRY *proc;
3213  HB_RESOURCE_JOB_ARG *proc_arg = (arg) ? &(arg->resource_job_arg) : NULL;
3214  char hb_info_str[HB_INFO_STR_MAX];
3215 
3216  if (arg == NULL || proc_arg == NULL)
3217  {
3218  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "invalid arg or proc_arg. (arg:%p, proc_arg:%p). \n", arg, proc_arg);
3219  return;
3220  }
3221 
3222  rv = pthread_mutex_lock (&hb_Resource->lock);
3223  proc = hb_return_proc_by_args (proc_arg->args);
3224  if (proc == NULL || proc->state == HB_PSTATE_DEREGISTERED)
3225  {
3226  pthread_mutex_unlock (&hb_Resource->lock);
3227  free_and_init (arg);
3228  return;
3229  }
3230 
3231  if (++(proc_arg->retries) > proc_arg->max_retries)
3232  {
3233  snprintf (error_string, LINE_MAX, "(exceed max retry count for pid: %d, args:%s)", proc->pid, proc->args);
3234 
3235  if (hb_Resource->state == HB_NSTATE_MASTER && proc->type == HB_PTYPE_SERVER && hb_Cluster->is_isolated == false)
3236  {
3237  hb_Resource->state = HB_NSTATE_SLAVE;
3238  pthread_mutex_unlock (&hb_Resource->lock);
3239 
3241  "Failed to restart the process " "and the current node will be demoted", error_string);
3242 
3243  /* keep checking problematic process */
3244  proc_arg->retries = 0;
3245  error =
3248  if (error != NO_ERROR)
3249  {
3250  free_and_init (arg);
3251  assert (false);
3252  }
3253 
3254  /* shutdown working server processes to change its role to slave */
3256  assert (error == NO_ERROR);
3257 
3258  return;
3259  }
3260  else
3261  {
3262  pthread_mutex_unlock (&hb_Resource->lock);
3264  "Keep checking to confirm the completion of the process startup", error_string);
3265  proc_arg->retries = 0;
3266  error =
3269  if (error != NO_ERROR)
3270  {
3271  assert (false);
3272  free_and_init (arg);
3273  }
3274  return;
3275  }
3276  }
3277 
3278  assert (proc->pid > 0);
3279  error = kill (proc->pid, 0);
3280  if (error)
3281  {
3282  pthread_mutex_unlock (&hb_Resource->lock);
3283  if (errno == ESRCH)
3284  {
3285  snprintf (error_string, LINE_MAX, "(process not found, expected pid: %d, args:%s)", proc->pid, proc->args);
3286  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_HB_PROCESS_EVENT, 2, "Failed to restart process",
3287  error_string);
3288 
3290  if (error != NO_ERROR)
3291  {
3292  assert (false);
3293  free_and_init (arg);
3294  }
3295  }
3296  else
3297  {
3298  error =
3301  if (error != NO_ERROR)
3302  {
3303  assert (false);
3304  free_and_init (arg);
3305  }
3306  }
3307  return;
3308  }
3309 
3310  if (proc->state == HB_PSTATE_NOT_REGISTERED)
3311  {
3312  if (proc->type == HB_PTYPE_SERVER)
3313  {
3315  }
3316  else
3317  {
3318  proc->state = HB_PSTATE_REGISTERED;
3319  }
3320 
3321  retry = false;
3322  }
3323 
3324  pthread_mutex_unlock (&hb_Resource->lock);
3325 
3327  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "%s", hb_info_str);
3328 
3329  if (retry)
3330  {
3331  error =
3334  if (error != NO_ERROR)
3335  {
3336  assert (false);
3337  free_and_init (arg);
3338  }
3339  return;
3340  }
3341 
3342  free_and_init (arg);
3343 
3344  return;
3345 }
3346 
3347 /*
3348  * hb_resource_job_confirm_dereg() -
3349  * return: none
3350  *
3351  * arg(in):
3352  */
3353 static void
3355 {
3356  int error, rv;
3357  bool retry = true;
3358  HB_PROC_ENTRY *proc;
3359  HB_RESOURCE_JOB_ARG *proc_arg = (arg) ? &(arg->resource_job_arg) : NULL;
3360 
3361  if (arg == NULL || proc_arg == NULL)
3362  {
3363  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "invalid arg or proc_arg. (arg:%p, proc_arg:%p). \n", arg, proc_arg);
3364  return;
3365  }
3366 
3367 #if defined (HB_VERBOSE_DEBUG)
3368  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "deregister confirm process. (pid:%d, args:{%s}). \n", proc_arg->pid,
3369  proc_arg->args);
3370 #endif
3371 
3372  rv = pthread_mutex_lock (&hb_Resource->lock);
3373  proc = hb_return_proc_by_pid (proc_arg->pid);
3374  if (proc == NULL)
3375  {
3376  pthread_mutex_unlock (&hb_Resource->lock);
3377  free_and_init (arg);
3378  return;
3379  }
3380 
3381  if (proc->state != HB_PSTATE_DEREGISTERED)
3382  {
3383  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "invalid process state. (pid:%d, state:%d). \n", proc_arg->pid, proc->state);
3384  pthread_mutex_unlock (&hb_Resource->lock);
3385 
3386  free_and_init (arg);
3387  return;
3388  }
3389 
3390  error = kill (proc->pid, 0);
3391  if (error)
3392  {
3393  if (errno == ESRCH)
3394  {
3395  retry = false;
3396  }
3397  }
3398  else
3399  {
3400  if (++(proc_arg->retries) > proc_arg->max_retries)
3401  {
3402  assert (proc->pid > 0);
3403  if (proc->pid > 0)
3404  {
3405  kill (proc->pid, SIGKILL);
3406  }
3407  retry = false;
3408  }
3409  }
3410 
3411  if (retry)
3412  {
3413  pthread_mutex_unlock (&hb_Resource->lock);
3414  error =
3417  if (error != NO_ERROR)
3418  {
3419  assert (false);
3420  free_and_init (arg);
3421  }
3422  return;
3423  }
3424 
3425  hb_Resource->num_procs--;
3426  hb_remove_proc (proc);
3427  proc = NULL;
3428 
3429  pthread_mutex_unlock (&hb_Resource->lock);
3430 
3431  free_and_init (arg);
3432 
3433  return;
3434 }
3435 
3436 /*
3437  * hb_resource_job_change_mode() -
3438  * return: none
3439  *
3440  * arg(in):
3441  */
3442 static void
3444 {
3445  int error, rv;
3446  HB_PROC_ENTRY *proc;
3447  char hb_info_str[HB_INFO_STR_MAX];
3448 
3449 #if !defined(WINDOWS)
3451 #endif
3452  rv = pthread_mutex_lock (&hb_Resource->lock);
3453  for (proc = hb_Resource->procs; proc; proc = proc->next)
3454  {
3455  if (proc->type != HB_PTYPE_SERVER)
3456  {
3457  continue;
3458  }
3459 
3460  if ((hb_Resource->state == HB_NSTATE_MASTER
3462  || (hb_Resource->state == HB_NSTATE_TO_BE_SLAVE
3465  {
3466  /* TODO : send heartbeat changemode request */
3467  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "send change-mode request. " "(node_state:%d, pid:%d, proc_state:%d). \n",
3468  hb_Resource->state, proc->pid, proc->state);
3469 
3470  error = hb_resource_send_changemode (proc);
3471  if (NO_ERROR != error)
3472  {
3473  /* TODO : if error */
3474  }
3475  }
3476  }
3477 
3478  if (hb_Resource->procs)
3479  {
3481  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "%s", hb_info_str);
3482  }
3483 
3484  pthread_mutex_unlock (&hb_Resource->lock);
3485 #if !defined(WINDOWS)
3487 #endif
3488 
3489  error =
3491  assert (error == NO_ERROR);
3492 
3493  if (arg)
3494  {
3495  free_and_init (arg);
3496  }
3497  return;
3498 }
3499 
3500 /*
3501  * resource job queue
3502  */
3503 
3504 /*
3505  * hb_resource_job_dequeue() - dequeue wrapper bound to the resource job queue
3506  * return: pointer to resource job entry, exactly as returned by hb_job_dequeue()
3507  *
3508  */
3509 static HB_JOB_ENTRY *
3511 {
  /* all resource jobs are kept in the resource_Jobs queue; just forward to the generic dequeue */
3512  return hb_job_dequeue (resource_Jobs);
3513 }
3514 
3515 /*
3516  * hb_resource_job_queue() -
3517  * return: NO_ERROR or ER_FAILED
3518  *
3519  * job_type(in):
3520  * arg(in):
3521  * msec(in):
3522  */
3523 static int
3524 hb_resource_job_queue (unsigned int job_type, HB_JOB_ARG * arg, unsigned int msec)
3525 {
3526  if (job_type >= HB_RJOB_MAX)
3527  {
3528  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "unknown job type. (job_type:%d).\n", job_type);
3529  return ER_FAILED;
3530  }
3531 
3532  return hb_job_queue (resource_Jobs, job_type, arg, msec);
3533 }
3534 
3535 /*
3536  * hb_resource_job_set_expire_and_reorder() -
3537  * return: NO_ERROR or ER_FAILED
3538  *
3539  * job_type(in):
3540  * msec(in):
3541  */
3542 static int
3543 hb_resource_job_set_expire_and_reorder (unsigned int job_type, unsigned int msec)
3544 {
3545  if (job_type >= HB_RJOB_MAX)
3546  {
3547  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "unknown job type. (job_type:%d).\n", job_type);
3548  return ER_FAILED;
3549  }
3550 
3551  hb_job_set_expire_and_reorder (resource_Jobs, job_type, msec);
3552 
3553  return NO_ERROR;
3554 }
3555 
3556 /*
3557  * hb_resource_job_shutdown() - shutdown wrapper bound to the resource job queue
3558  * return: none
3559  *
3560  */
3561 static void
3563 {
  /* forwards to the generic hb_job_shutdown() for the resource_Jobs queue */
3564  return hb_job_shutdown (resource_Jobs);
3565 }
3566 
3567 /*
3568  * resource process
3569  */
3570 
3571 /*
3572  * hb_alloc_new_proc() - allocate a process entry, reset it and link it into hb_Resource->procs
3573  * return: pointer to resource process entry, or NULL when malloc fails
3574  *
3575  */
3576 static HB_PROC_ENTRY *
3578 {
3579  HB_PROC_ENTRY *p;
3580  HB_PROC_ENTRY **first_pp;
3581 
3582  p = (HB_PROC_ENTRY *) malloc (sizeof (HB_PROC_ENTRY));
3583  if (p)
3584  {
  /* start from an all-zero entry, then set the fields below explicitly */
3585  memset ((void *) p, 0, sizeof (HB_PROC_ENTRY));
3586  p->state = HB_PSTATE_UNKNOWN;
3587  p->next = NULL;
3588  p->prev = NULL;
3589  p->being_shutdown = false;
3590  p->server_hang = false;
3591  p->is_curr_eof_received = false;
3592  LSA_SET_NULL (&p->prev_eof);
3593  LSA_SET_NULL (&p->curr_eof);
3594 
  /* the entry is handed to hb_list_add() as a generic HB_LIST node of the hb_Resource->procs list */
3595  first_pp = &hb_Resource->procs;
3596  hb_list_add ((HB_LIST **) first_pp, (HB_LIST *) p);
3597  }
3598 
  /* NULL is passed through unchanged when the allocation failed */
3599  return (p);
3600 }
3601 
3602 /*
3603  * hb_remove_proc() - unlink a process entry from its list and release it
3604  * return: none
3605  *
3606  * entry_p(in): entry to remove; a NULL entry is ignored
3607  */
3608 static void
3610 {
3611  if (entry_p)
3612  {
  /* unlink first, then release the memory; callers in this file adjust hb_Resource->num_procs themselves */
3613  hb_list_remove ((HB_LIST *) entry_p);
3614  free_and_init (entry_p);
3615  }
3616  return;
3617 }
3618 
3619 /*
3620  * hb_remove_all_procs() - remove every entry of a process list via hb_remove_proc()
3621  * return: none
3622  *
3623  * first(in): first entry of the list to tear down; may be NULL (nothing is done)
3624  */
3625 static void
3627 {
3628  HB_PROC_ENTRY *proc, *next_proc;
3629 
3630  for (proc = first; proc; proc = next_proc)
3631  {
  /* read the successor before hb_remove_proc() releases the current entry */
3632  next_proc = proc->next;
3633  hb_remove_proc (proc);
3634  }
3635 }
3636 
3637 /*
3638  * hb_return_proc_by_args() - find the first entry in hb_Resource->procs whose args string equals args
3639  * return: pointer to resource process entry, or NULL when no entry matches
3640  *
3641  * args(in): argument string compared with strcmp() against each entry's args
3642  */
3643 static HB_PROC_ENTRY *
3645 {
3646  HB_PROC_ENTRY *proc;
3647 
  /* linear scan; the callers visible in this file hold hb_Resource->lock around the call */
3648  for (proc = hb_Resource->procs; proc; proc = proc->next)
3649  {
  /* strcmp() is non-zero on mismatch, so this skips entries that differ */
3650  if (strcmp (proc->args, args))
3651  {
3652  continue;
3653  }
3654  return proc;
3655  }
3656  return NULL;
3657 }
3658 
3659 /*
3660  * hb_return_proc_by_pid() - find the first entry in hb_Resource->procs with the given pid
3661  * return: pointer to resource process entry, or NULL when no entry matches
3662  *
3663  * pid(in): process id compared against each entry's pid field
3664  */
3665 static HB_PROC_ENTRY *
3667 {
3668  HB_PROC_ENTRY *proc;
3669 
  /* linear scan; the callers visible in this file hold hb_Resource->lock around the call */
3670  for (proc = hb_Resource->procs; proc; proc = proc->next)
3671  {
3672  if (proc->pid != pid)
3673  {
3674  continue;
3675  }
3676  return proc;
3677 
3678  }
3679  return NULL;
3680 }
3681 
3682 /*
3683  * hb_return_proc_by_fd() - find the first entry in hb_Resource->procs with the given socket descriptor
3684  * return: pointer to resource process entry, or NULL when no entry matches
3685  *
3686  * sfd(in): descriptor compared against each entry's sfd field
3687  */
3688 static HB_PROC_ENTRY *
3690 {
3691  HB_PROC_ENTRY *proc;
3692 
  /* linear scan; the callers visible in this file hold hb_Resource->lock around the call */
3693  for (proc = hb_Resource->procs; proc; proc = proc->next)
3694  {
3695  if (proc->sfd != sfd)
3696  {
3697  continue;
3698  }
3699  return proc;
3700 
3701  }
3702  return NULL;
3703 }
3704 
/*
 * hb_proc_make_arg() - split a command-line string into an argv-style array
 *   return: none
 *
 *   arg(out): array that receives pointers to the tokens found in args. It
 *             must have room for HB_MAX_NUM_PROC_ARGV entries. At most
 *             HB_MAX_NUM_PROC_ARGV - 1 tokens are stored, so the last slot of
 *             a zero-initialized array stays NULL and the array can be given
 *             to execv (). Slots after the last stored token are not written.
 *   args(in/out): string to split on space, tab and newline. strtok_r ()
 *                 modifies it in place and the stored pointers point into it,
 *                 so it must stay alive as long as arg is used.
 *
 * NOTE: tokens beyond the limit are dropped instead of being written past the
 *       end of the array. A NULL arg or args makes the call a no-op.
 */
static void
hb_proc_make_arg (char **arg, char *args)
{
  static const char delimiters[] = " \t\n";
  const int max_tokens = HB_MAX_NUM_PROC_ARGV - 1;
  char *tok;
  char *save = NULL;
  int count = 0;

  if (arg == NULL || args == NULL)
    {
      return;
    }

  tok = strtok_r (args, delimiters, &save);
  while (tok != NULL && count < max_tokens)
    {
      arg[count++] = tok;
      tok = strtok_r (NULL, delimiters, &save);
    }

  return;
}
3727 
3728 
3729 /*
3730  * resource process connection
3731  */
3732 
3733 /*
3734  * hb_cleanup_conn_and_start_process() -
3735  * return: none
3736  *
3737  * sfd(in):
3738  * conn(in):
3739  */
3740 void
3742 {
3743  int error, rv;
3744  char error_string[LINE_MAX] = "";
3745  HB_PROC_ENTRY *proc;
3746  HB_JOB_ARG *job_arg;
3747  HB_RESOURCE_JOB_ARG *proc_arg;
3748 
3750 
3751  if (hb_Resource == NULL)
3752  {
3753  return;
3754  }
3755 
3756  rv = pthread_mutex_lock (&hb_Resource->lock);
3757  proc = hb_return_proc_by_fd (sfd);
3758  if (proc == NULL)
3759  {
3760  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "cannot find process. (fd:%d). \n", sfd);
3761  pthread_mutex_unlock (&hb_Resource->lock);
3762  return;
3763  }
3764 
3765  proc->conn = NULL;
3766  proc->sfd = INVALID_SOCKET;
3767 
3768  if (proc->state < HB_PSTATE_REGISTERED)
3769  {
3770  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "unexpected process's state. " "(fd:%d, pid:%d, state:%d, args:{%s}). \n",
3771  sfd, proc->pid, proc->state, proc->args);
3772  /*
3773  * Do not delete process entry.
3774  * process entry will be removed by resource job.
3775  */
3776 
3777  pthread_mutex_unlock (&hb_Resource->lock);
3778  return;
3779  }
3780 
3781  gettimeofday (&proc->ktime, NULL);
3782 #if defined (HB_VERBOSE_DEBUG)
3783  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "process terminated. (args:{%s}, pid:%d, state:%d). \n", proc->args, proc->pid,
3784  proc->state);
3785 #endif
3786 
3787  snprintf (error_string, LINE_MAX, "(pid:%d, args:%s)", proc->pid, proc->args);
3788 
3789  if (proc->being_shutdown)
3790  {
3791  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_HB_PROCESS_EVENT, 2, "Process shutdown detected",
3792  error_string);
3793  }
3794  else
3795  {
3796  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_HB_PROCESS_EVENT, 2, "Process failure detected",
3797  error_string);
3798  }
3799 
3800  if (hb_Resource->state == HB_NSTATE_MASTER && proc->type == HB_PTYPE_SERVER && hb_Cluster->is_isolated == false)
3801  {
3802  if (HB_GET_ELAPSED_TIME (proc->ktime, proc->rtime) <
3804  {
3805  /* demote the current node */
3806  hb_Resource->state = HB_NSTATE_SLAVE;
3807 
3808  snprintf (error_string, LINE_MAX, "(args:%s)", proc->args);
3810  "Process failure repeated within a short period of time. " "The current node will be demoted",
3811  error_string);
3812 
3813  /* shutdown working server processes to change its role to slave */
3815  assert (error == NO_ERROR);
3816  }
3817  }
3818 
3819  job_arg = (HB_JOB_ARG *) malloc (sizeof (HB_JOB_ARG));
3820  if (job_arg == NULL)
3821  {
3822  pthread_mutex_unlock (&hb_Resource->lock);
3824  return;
3825  }
3826 
3827  proc_arg = &(job_arg->resource_job_arg);
3828  proc_arg->pid = proc->pid;
3829  memcpy ((void *) &proc_arg->args[0], proc->args, sizeof (proc_arg->args));
3830  proc_arg->retries = 0;
3832  gettimeofday (&proc_arg->ftime, NULL);
3833 
3834  proc->state = HB_PSTATE_DEAD;
3835  proc->server_hang = false;
3836  proc->is_curr_eof_received = false;
3837  LSA_SET_NULL (&proc->prev_eof);
3838  LSA_SET_NULL (&proc->curr_eof);
3839 
3840  pthread_mutex_unlock (&hb_Resource->lock);
3841 
3843  assert (error == NO_ERROR);
3844 
3845  return;
3846 }
3847 
3848 /*
3849  * hb_is_regiestered_process() - check whether a process entry exists for the given args
3850  * return: true when hb_Resource exists, is not shutting down and holds an entry
3851  *         whose args match; false otherwise
3852  * conn(in):
 * args(in): argument string looked up with hb_return_proc_by_args()
3853  */
3854 bool
3856 {
3857  HB_PROC_ENTRY *proc;
3858 
  /* the heartbeat resource has not been set up: nothing can be registered */
3859  if (hb_Resource == NULL)
3860  {
3861  return false;
3862  }
3863 
3864  (void) pthread_mutex_lock (&hb_Resource->lock);
  /* while the resource is shutting down every process is reported as not registered */
3865  if (hb_Resource->shutdown)
3866  {
3867  pthread_mutex_unlock (&hb_Resource->lock);
3868  return false;
3869  }
3870 
  /* the lookup runs under hb_Resource->lock; after unlocking, proc is only tested against NULL, never dereferenced */
3871  proc = hb_return_proc_by_args (args);
3872  (void) pthread_mutex_unlock (&hb_Resource->lock);
3873 
3874  if (proc == NULL)
3875  {
3876  return false;
3877  }
3878 
3879  return true;
3880 }
3881 
3882 /*
3883  * hb_register_new_process() -
3884  * return: none
3885  *
3886  * conn(in):
3887  * rid(in):
3888  */
3889 void
3891 {
3892  int rv, buffer_size;
3894  HB_PROC_ENTRY *proc;
3895  unsigned char proc_state = HB_PSTATE_UNKNOWN;
3896  char buffer[HB_BUFFER_SZ];
3897  char error_string[LINE_MAX] = "";
3898 
3899  if (hb_Resource == NULL)
3900  {
3901  return;
3902  }
3903 
3904  buffer_size = sizeof (HBP_PROC_REGISTER);
3905 
3906  rv = css_receive_heartbeat_data (conn, buffer, buffer_size);
3907  if (rv != NO_ERRORS)
3908  {
3909  return;
3910  }
3911 
3912  hbp_proc_register = (HBP_PROC_REGISTER *) buffer;
3913 
3914  rv = pthread_mutex_lock (&hb_Resource->lock);
3915  if (hb_Resource->shutdown)
3916  {
3917  pthread_mutex_unlock (&hb_Resource->lock);
3919  return;
3920  }
3921 
3922  proc = hb_return_proc_by_args (hbp_proc_register->args);
3923  if (proc == NULL)
3924  {
3925  proc = hb_alloc_new_proc ();
3926  if (proc == NULL)
3927  {
3928  pthread_mutex_unlock (&hb_Resource->lock);
3930  return;
3931  }
3932  else
3933  {
3934  proc_state = HB_PSTATE_REGISTERED; /* first register */
3935  gettimeofday (&proc->frtime, NULL);
3936  }
3937  }
3938  else
3939  {
3940  proc_state = (proc->state == HB_PSTATE_STARTED) ? HB_PSTATE_NOT_REGISTERED
3941  /* restarted by heartbeat */ :
3942  HB_PSTATE_UNKNOWN /* already registered */ ;
3943  }
3944 
3945  if ((proc_state == HB_PSTATE_REGISTERED)
3946  || (proc_state == HB_PSTATE_NOT_REGISTERED && proc->pid == (int) ntohl (hbp_proc_register->pid)
3947  && !(kill (proc->pid, 0) && errno == ESRCH)))
3948  {
3949  proc->state = proc_state;
3950  proc->sfd = conn->fd;
3951  proc->conn = conn;
3952  gettimeofday (&proc->rtime, NULL);
3953  proc->changemode_gap = 0;
3954  proc->server_hang = false;
3955 
3956  if (proc->state == HB_PSTATE_REGISTERED)
3957  {
3958  proc->pid = ntohl (hbp_proc_register->pid);
3959  proc->type = ntohl (hbp_proc_register->type);
3960  if (proc->type == HB_PTYPE_SERVER)
3961  {
3963  }
3964  memcpy ((void *) &proc->exec_path[0], (void *) &hbp_proc_register->exec_path[0], sizeof (proc->exec_path));
3965  memcpy ((void *) &proc->args[0], (void *) &hbp_proc_register->args[0], sizeof (proc->args));
3966  hb_Resource->num_procs++;
3967  }
3968 
3969  assert (proc->pid > 0);
3970 
3971 #if defined (HB_VERBOSE_DEBUG)
3973  "hbp_proc_register. (sizeof(hbp_proc_register):%d, \n"
3974  "type:%d, state:%d, pid:%d, exec_path:{%s}, " "args:{%s}). \n", sizeof (HBP_PROC_REGISTER),
3975  proc->type, proc->state, proc->pid, proc->exec_path, proc->args);
3976  hb_print_procs ();
3977 #endif
3978 
3979  snprintf (error_string, LINE_MAX, "%s (pid:%d, state:%s, args:%s)", HB_RESULT_SUCCESS_STR,
3980  ntohl (hbp_proc_register->pid), hb_process_state_string (proc->type, proc->state),
3981  hbp_proc_register->args);
3982  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_HB_PROCESS_EVENT, 2, "Registered as local process entries",
3983  error_string);
3984 
3985  pthread_mutex_unlock (&hb_Resource->lock);
3986  return;
3987  }
3988 
3989  pthread_mutex_unlock (&hb_Resource->lock);
3990 
3991  snprintf (error_string, LINE_MAX, "%s (expected pid: %d, pid:%d, state:%s, args:%s)", HB_RESULT_FAILURE_STR,
3992  proc->pid, ntohl (hbp_proc_register->pid), hb_process_state_string (proc->type, proc->state),
3993  hbp_proc_register->args);
3994  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_HB_PROCESS_EVENT, 2, "Registered as local process entries",
3995  error_string);
3996 
3998  return;
3999 }
4000 
4001 /*
4002  * hb_resource_send_changemode -
4003  * return: NO_ERROR if the changemode request was sent, ER_FAILED otherwise
4004  *
4005  * proc(in):
4006  */
4007 static int
4009 {
4010  int error = NO_ERROR;
4011  HA_SERVER_STATE state;
4012  int nstate;
4013  int sig = 0;
4014  char error_string[LINE_MAX] = "";
4015 
4016  if (proc->conn == NULL)
4017  {
4018  return ER_FAILED;
4019  }
4020 
4022  {
4023  sig = SIGTERM;
4024  }
4026  {
4027  sig = SIGKILL;
4028  }
4029 
4030  if (sig)
4031  {
4032  assert (proc->pid > 0);
4033  if (proc->pid > 0 && (kill (proc->pid, 0) == 0 || errno != ESRCH))
4034  {
4035  snprintf (error_string, sizeof (error_string),
4036  "process does not respond for a long time. kill pid %d signal %d.", proc->pid, sig);
4037  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_HB_PROCESS_EVENT, 2, "Process failure detected",
4038  error_string);
4039  kill (proc->pid, sig);
4040  }
4041  return ER_FAILED;
4042  }
4043 
4044  switch (hb_Resource->state)
4045  {
4046  case HB_NSTATE_MASTER:
4047  {
4048  state = HA_SERVER_STATE_ACTIVE;
4049  }
4050  break;
4051  case HB_NSTATE_TO_BE_SLAVE:
4052  {
4053  state = HA_SERVER_STATE_STANDBY;
4054  }
4055  break;
4056  case HB_NSTATE_SLAVE:
4057  default:
4058  {
4059  return ER_FAILED;
4060  }
4061  break;
4062  }
4063 
4065  if (NO_ERRORS != error)
4066  {
4067  return ER_FAILED;
4068  }
4069 
4070  nstate = htonl ((int) state);
4071  error = css_send_heartbeat_data (proc->conn, (char *) &nstate, sizeof (nstate));
4072  if (NO_ERRORS != error)
4073  {
4074  snprintf (error_string, LINE_MAX,
4075  "Failed to send changemode request to the server. " "(state:%d[%s], args:[%s], pid:%d)", state,
4076  css_ha_server_state_string (state), proc->args, proc->pid);
4078 
4079  return ER_FAILED;
4080  }
4081 
4082  snprintf (error_string, LINE_MAX, "Send changemode request to the server. " "(state:%d[%s], args:[%s], pid:%d)",
4083  state, css_ha_server_state_string (state), proc->args, proc->pid);
4085 
4086  return NO_ERROR;
4087 }
4088 
4089 /*
4090  * hb_resource_receive_changemode - read a server's changemode reply from conn and record it
4091  * return: none
4092  *
4093  * conn(in): connection the reply is read from; conn->fd is used to look up the process entry
      *
      * The reply is one int in network byte order, converted to an HA_SERVER_STATE with ntohl.
      * The function returns without any change when hb_Resource is NULL, when the receive fails,
      * or when no process entry matches conn->fd or the entry is HB_PSTATE_DEREGISTERED.
      * NOTE(review): the function-name line, the line following the snprintf and the case labels
      * of the switch are not visible in this copy; confirm them against the original file.
4094  */
4095 void
4097 {
4098  int sfd, rv;
4099  HB_PROC_ENTRY *proc;
4100  HA_SERVER_STATE state;
4101  int nstate;
4102  char error_string[LINE_MAX] = "";
4103 
4104  if (hb_Resource == NULL)
4105  {
4106  return;
4107  }
4108 
4109  rv = css_receive_heartbeat_data (conn, (char *) &nstate, sizeof (nstate));
4110  if (rv != NO_ERRORS)
4111  {
4112  return;
4113  }
4114  state = (HA_SERVER_STATE) ntohl (nstate);
4115 
4116  sfd = conn->fd;
      /* cluster lock is taken before the resource lock; both exits below release them in reverse order */
4117  rv = pthread_mutex_lock (&hb_Cluster->lock);
4118  rv = pthread_mutex_lock (&hb_Resource->lock);
4119  proc = hb_return_proc_by_fd (sfd);
4120  if (proc == NULL || proc->state == HB_PSTATE_DEREGISTERED)
4121  {
4122  pthread_mutex_unlock (&hb_Resource->lock);
4123  pthread_mutex_unlock (&hb_Cluster->lock);
4124  return;
4125  }
4126 
4127  snprintf (error_string, LINE_MAX, "Receive changemode response from the server. " "(state:%d[%s], args:[%s], pid:%d)",
4128  state, css_ha_server_state_string (state), proc->args, proc->pid);
4130 
      /* only one arm has visible statements: it moves both the cluster and the resource to HB_NSTATE_SLAVE */
4131  switch (state)
4132  {
4135  break;
4136 
4139  break;
4140 
4143  hb_Cluster->state = HB_NSTATE_SLAVE;
4144  hb_Resource->state = HB_NSTATE_SLAVE;
4145  break;
4146 
4149  break;
4150 
4151  default:
4152  break;
4153  }
4154 
      /* a reply was received for this process: reset its changemode_gap */
4155  proc->changemode_gap = 0;
4156 
4157  pthread_mutex_unlock (&hb_Resource->lock);
4158  pthread_mutex_unlock (&hb_Cluster->lock);
4159 
4160  return;
4161 }
4162 
4163 /*
4164  * hb_resource_check_server_log_grow() -
4165  * check if active server is alive, by testing whether each server's reported eof LSA advanced
4166  * return: false if at least one server process was flagged as hung during this call, true otherwise
4167  *
      * Only entries of type HB_PTYPE_SERVER in state HB_PSTATE_REGISTERED_AND_ACTIVE that are not
      * already marked server_hang and that have a non-null curr_eof are examined.
      * The function takes no lock itself; the visible caller holds hb_Cluster->lock and
      * hb_Resource->lock around the call.
4168  */
4169 static bool
4171 {
4172  int dead_cnt = 0;
4173  HB_PROC_ENTRY *proc;
4174 
4175  for (proc = hb_Resource->procs; proc; proc = proc->next)
4176  {
4177  if (proc->type != HB_PTYPE_SERVER || proc->state != HB_PSTATE_REGISTERED_AND_ACTIVE || proc->server_hang == true)
4178  {
4179  continue;
4180  }
4181 
      /* curr_eof is still null: there is nothing to compare for this process */
4182  if (LSA_ISNULL (&proc->curr_eof) == true)
4183  {
4184  continue;
4185  }
4186 
4187  if (LSA_GT (&proc->curr_eof, &proc->prev_eof) == true)
4188  {
      /* eof moved forward: remember it as the baseline for the next check */
4189  LSA_COPY (&proc->prev_eof, &proc->curr_eof);
4190  }
4191  else
4192  {
      /* eof did not advance: flag the process and count it */
4193  proc->server_hang = true;
4194  dead_cnt++;
4195 
      /* the two log messages distinguish "eof reported but unchanged" from "no eof reply at all" */
4196  if (proc->is_curr_eof_received)
4197  {
4198 #if !defined(WINDOWS)
4199  syslog (LOG_ALERT, "[CUBRID] no change to eof [%lld|%d] received from (pid:%d)",
4200  LSA_AS_ARGS (&proc->curr_eof), proc->pid);
4201 #endif
4202  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "no change to eof [%lld|%d] received from (pid:%d)\n",
4203  LSA_AS_ARGS (&proc->curr_eof), proc->pid);
4204  }
4205  else
4206  {
4207 #if !defined(WINDOWS)
4208  syslog (LOG_ALERT, "[CUBRID] no response to eof request from (pid:%d)", proc->pid);
4209 #endif
4210  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "no response to eof request from (pid:%d)\n", proc->pid);
4211  }
4212  }
4213  }
4214  if (dead_cnt > 0)
4215  {
4216  return false;
4217  }
4218 
4219  return true;
4220 }
4221 
4222 /*
4223  * hb_resource_send_get_eof - walk the process list and clear is_curr_eof_received on selected entries
4224  * return: none
4225  *
4226  * proc(in):
      *
      * Does nothing unless hb_Resource->state is HB_NSTATE_MASTER.
      * NOTE(review): the header lists proc(in), but the visible body declares proc as a local
      * iterator. The function-name line, the condition that selects entries and the statement
      * preceding the flag reset are not visible in this copy; presumably the eof request is sent
      * there - TODO confirm against the original file.
4227  */
4228 static void
4230 {
4231  HB_PROC_ENTRY *proc;
4232 
4233  if (hb_Resource->state != HB_NSTATE_MASTER)
4234  {
4235  return;
4236  }
4237 
4238  for (proc = hb_Resource->procs; proc; proc = proc->next)
4239  {
4241  {
      /* mark the reply as outstanding for this entry */
4243  proc->is_curr_eof_received = false;
4244  }
4245  }
4246 
4247  return;
4248 }
4249 
4250 /*
4251  * hb_resource_receive_get_eof - read an eof reply from conn and store the unpacked LSA in the process entry
4252  * return: none
4253  *
4254  * conn(in): connection the reply is read from; conn->fd identifies the process entry
      *
      * Returns without any change when the receive fails or when no entry matches conn->fd.
      * NOTE(review): the function-name line, the declaration of a_reply and the condition that
      * guards the update of curr_eof are not visible in this copy.
4255  */
4256 void
4258 {
4259  int rv;
4260  HB_PROC_ENTRY *proc;
4262  char *reply;
4263 
4264  reply = OR_ALIGNED_BUF_START (a_reply);
4265 
      /* the receive happens before hb_Resource->lock is taken */
4266  rv = css_receive_heartbeat_data (conn, reply, OR_ALIGNED_BUF_SIZE (a_reply));
4267  if (rv != NO_ERRORS)
4268  {
4269  return;
4270  }
4271 
4272  rv = pthread_mutex_lock (&hb_Resource->lock);
4273 
4274  proc = hb_return_proc_by_fd (conn->fd);
4275  if (proc == NULL)
4276  {
4277  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "cannot find process. (fd:%d). \n", conn->fd);
4278  pthread_mutex_unlock (&hb_Resource->lock);
4279  return;
4280  }
4281 
4283  {
      /* record the reported eof and note that a reply arrived */
4284  or_unpack_log_lsa (reply, &proc->curr_eof);
4285  proc->is_curr_eof_received = true;
4286  }
4287 
4288  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "received eof [%lld|%d]\n", LSA_AS_ARGS (&proc->curr_eof));
4289 
4290  pthread_mutex_unlock (&hb_Resource->lock);
4291 
4292  return;
4293 }
4293 }
4294 
4295 
4296 /*
4297  * heartbeat worker threads
4298  */
4299 
4300 /*
4301  * hb_thread_cluster_worker - thread body that runs queued cluster jobs until cluster_Jobs->shutdown is set
4302  * return: 0 on WINDOWS, NULL otherwise (the value carries no information)
4303  *
4304  * arg(in): not used by the visible body
      *
      * NOTE(review): the non-WINDOWS function-name line is not visible in this copy.
4305  */
4306 #if defined(WINDOWS)
4307 static unsigned __stdcall
4308 hb_thread_cluster_worker (void *arg)
4309 #else
4310 static void *
4312 #endif
4313 {
4314  HB_JOB_ENTRY *job;
4315  /* *INDENT-OFF* */
4316  cuberr::context er_context (true);
4317  /* *INDENT-ON* */
4318 
4319 #if defined (HB_VERBOSE_DEBUG)
4320  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "thread started. (thread:{%s}, tid:%d).\n", __func__, THREAD_ID ());
4321 #endif
4322 
4323  while (cluster_Jobs->shutdown == false)
4324  {
      /* drain every job that can be dequeued now; each entry is freed after its function ran */
4325  while ((job = hb_cluster_job_dequeue ()) != NULL)
4326  {
4327  job->func (job->arg);
4328  free_and_init (job);
4329  }
4330 
      /* nothing left to dequeue: pause before polling the queue again */
4331  SLEEP_MILISEC (0, 10);
4332  }
4333 
4334 #if defined (HB_VERBOSE_DEBUG)
4335  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "thread exit.\n");
4336 #endif
4337 
4338 #if defined(WINDOWS)
4339  return 0;
4340 #else /* WINDOWS */
4341  return NULL;
4342 #endif /* WINDOWS */
4343 }
4344 
4345 /*
4346  * hb_thread_cluster_reader - thread body that reads datagrams from the cluster socket and
      * hands each one to hb_cluster_receive_heartbeat until hb_Cluster->shutdown is set
4347  * return: 0 on WINDOWS, NULL otherwise (the value carries no information)
4348  *
4349  * arg(in): not used by the visible body
      *
      * NOTE(review): the non-WINDOWS function-name line is not visible in this copy.
4350  */
4351 #if defined(WINDOWS)
4352 static unsigned __stdcall
4353 hb_thread_cluster_reader (void *arg)
4354 #else
4355 static void *
4357 #endif
4358 {
4359  int error;
4360  SOCKET sfd;
4361  char buffer[HB_BUFFER_SZ + MAX_ALIGNMENT], *aligned_buffer;
4362  int len;
4363  struct pollfd po[1] = { {0, 0, 0} };
4364 
4365  struct sockaddr_in from;
4366  socklen_t from_len;
4367 
4368  /* *INDENT-OFF* */
4369  cuberr::context er_context (true);
4370  /* *INDENT-ON* */
4371 
4372 #if defined (HB_VERBOSE_DEBUG)
4373  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "thread started. (thread:{%s}, tid:%d).\n", __func__, THREAD_ID ());
4374 #endif
4375 
      /* buffer is over-allocated by MAX_ALIGNMENT so that an aligned start can be taken inside it */
4376  aligned_buffer = PTR_ALIGN (buffer, MAX_ALIGNMENT);
      /* the socket descriptor is read once here and kept for the whole loop */
4377  sfd = hb_Cluster->sfd;
4378  while (hb_Cluster->shutdown == false)
4379  {
4380  po[0].fd = sfd;
4381  po[0].events = POLLIN;
      /* poll with a timeout of 1 so the shutdown flag is re-checked regularly; errors and timeouts just retry */
4382  error = poll (po, 1, 1);
4383  if (error <= 0)
4384  {
4385  continue;
4386  }
4387 
      /* read only while the cached descriptor still equals hb_Cluster->sfd */
4388  if ((po[0].revents & POLLIN) && sfd == hb_Cluster->sfd)
4389  {
4390  from_len = sizeof (from);
4391  len = recvfrom (sfd, (void *) aligned_buffer, HB_BUFFER_SZ, 0, (struct sockaddr *) &from, &from_len);
4392  if (len > 0)
4393  {
4394  hb_cluster_receive_heartbeat (aligned_buffer, len, &from, from_len);
4395  }
4396  }
4397  }
4398 
4399 #if defined (HB_VERBOSE_DEBUG)
4400  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "thread exit.\n");
4401 #endif
4402 
4403 #if defined(WINDOWS)
4404  return 0;
4405 #else /* WINDOWS */
4406  return NULL;
4407 #endif /* WINDOWS */
4408 }
4409 
4410 /*
4411  * hb_thread_resource_worker - thread body that runs queued resource jobs until resource_Jobs->shutdown is set
4412  * return: 0 on WINDOWS, NULL otherwise (the value carries no information)
4413  *
4414  * arg(in): not used by the visible body
      *
      * NOTE(review): the non-WINDOWS function-name line is not visible in this copy.
4415  */
4416 #if defined(WINDOWS)
4417 static unsigned __stdcall
4418 hb_thread_resource_worker (void *arg)
4419 #else
4420 static void *
4422 #endif
4423 {
4424  HB_JOB_ENTRY *job;
4425  /* *INDENT-OFF* */
4426  cuberr::context er_context (true);
4427  /* *INDENT-ON* */
4428 
4429 #if defined (HB_VERBOSE_DEBUG)
4430  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "thread started. (thread:{%s}, tid:%d).\n", __func__, THREAD_ID ());
4431 #endif
4432 
4433  while (resource_Jobs->shutdown == false)
4434  {
      /* drain every job that can be dequeued now; each entry is freed after its function ran */
4435  while ((job = hb_resource_job_dequeue ()) != NULL)
4436  {
4437  job->func (job->arg);
4438  free_and_init (job);
4439  }
4440 
      /* nothing left to dequeue: pause before polling the queue again */
4441  SLEEP_MILISEC (0, 10);
4442  }
4443 
4444 #if defined (HB_VERBOSE_DEBUG)
4445  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "thread exit.\n");
4446 #endif
4447 
4448 #if defined(WINDOWS)
4449  return 0;
4450 #else /* WINDOWS */
4451  return NULL;
4452 #endif /* WINDOWS */
4453 }
4454 
4455 /*
4456  * hb_thread_check_disk_failure - thread body that periodically checks, while this node is master,
      * whether the servers' eof keeps growing, and sets hb_Resource->state to slave when it does not
4457  * return: 0 on WINDOWS, NULL otherwise (the value carries no information)
4458  *
4459  * arg(in): not used by the visible body
      *
      * Runs until hb_Resource->shutdown is set. A check is made when interval > 0 and
      * remaining_time_msecs has run down to 0 or below; afterwards the countdown is reloaded with
      * interval * 1000 and reduced by HB_DISK_FAILURE_CHECK_TIMER_IN_MSECS on each loop pass.
      * NOTE(review): the non-WINDOWS function-name line, the statements that assign interval and
      * error, and several statements inside the !WINDOWS sections are not visible in this copy.
4460  */
4461 #if defined(WINDOWS)
4462 static unsigned __stdcall
4463 hb_thread_check_disk_failure (void *arg)
4464 #else
4465 static void *
4467 #endif
4468 {
4469  int rv, error;
4470  int interval;
4471  INT64 remaining_time_msecs = 0;
4472  /* *INDENT-OFF* */
4473  cuberr::context er_context (true);
4474  /* *INDENT-ON* */
4475 
4476 #if defined (HB_VERBOSE_DEBUG)
4477  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "thread started. (thread:{%s}, tid:%d).\n", __func__, THREAD_ID ());
4478 #endif
4479 
4480  while (hb_Resource->shutdown == false)
4481  {
4483  if (interval > 0 && remaining_time_msecs <= 0)
4484  {
4485 #if !defined(WINDOWS)
4487 #endif /* !WINDOWS */
      /* cluster lock is taken before the resource lock and released after it */
4488  rv = pthread_mutex_lock (&hb_Cluster->lock);
4489  rv = pthread_mutex_lock (&hb_Resource->lock);
4490 
4491  if (hb_Cluster->is_isolated == false && hb_Resource->state == HB_NSTATE_MASTER)
4492  {
4493  if (hb_resource_check_server_log_grow () == false)
4494  {
4495  /* be silent to avoid blocking write operation on disk */
4497  hb_Resource->state = HB_NSTATE_SLAVE;
4498 
4499  pthread_mutex_unlock (&hb_Resource->lock);
4500  pthread_mutex_unlock (&hb_Cluster->lock);
4501 #if !defined(WINDOWS)
4503 
4504  syslog (LOG_ALERT, "[CUBRID] %s () at %s:%d", __func__, __FILE__, __LINE__);
4505 #endif /* !WINDOWS */
4506 
4508  assert (error == NO_ERROR);
4509 
      /* restart the loop without reloading remaining_time_msecs */
4510  continue;
4511  }
4512  }
4513 
4514  if (hb_Resource->state == HB_NSTATE_MASTER)
4515  {
4517  }
4518  pthread_mutex_unlock (&hb_Resource->lock);
4519  pthread_mutex_unlock (&hb_Cluster->lock);
4520 #if !defined(WINDOWS)
4522 #endif /* !WINDOWS */
4523 
      /* interval is multiplied by 1000 to reload the millisecond countdown */
4524  remaining_time_msecs = interval * 1000;
4525  }
4526 
4528  if (interval > 0)
4529  {
4530  remaining_time_msecs -= HB_DISK_FAILURE_CHECK_TIMER_IN_MSECS;
4531  }
4532  }
4533 
4534 #if defined (HB_VERBOSE_DEBUG)
4535  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "thread exit.\n");
4536 #endif
4537 
4538 #if defined(WINDOWS)
4539  return 0;
4540 #else /* WINDOWS */
4541  return NULL;
4542 #endif /* WINDOWS */
4543 }
4544 
4545 /*
4546  * master heartbeat initializer
4547  */
4548 
4549 /*
4550  * hb_cluster_job_initialize - allocate (on first use) and reset the cluster job queue
4551  * return: NO_ERROR, ER_OUT_OF_VIRTUAL_MEMORY when the queue cannot be allocated, or ER_FAILED
4552  *
      * cluster_Jobs and its mutex are created only when cluster_Jobs is NULL; on every call the
      * queue fields are reset under the mutex and job_funcs is pointed at hb_cluster_jobs.
      * NOTE(review): the function-name line, the error-set line of the allocation failure and the
      * statement that assigns error after the reset are not visible in this copy.
4553  */
4554 static int
4556 {
4557  int rv, error;
4558 
4559  if (cluster_Jobs == NULL)
4560  {
4561  cluster_Jobs = (HB_JOB *) malloc (sizeof (HB_JOB));
4562  if (cluster_Jobs == NULL)
4563  {
4565  return ER_OUT_OF_VIRTUAL_MEMORY;
4566  }
4567 
4568  pthread_mutex_init (&cluster_Jobs->lock, NULL);
4569  }
4570 
4571  rv = pthread_mutex_lock (&cluster_Jobs->lock);
4572  cluster_Jobs->shutdown = false;
4573  cluster_Jobs->num_jobs = 0;
4574  cluster_Jobs->jobs = NULL;
4575  cluster_Jobs->job_funcs = &hb_cluster_jobs[0];
4576  pthread_mutex_unlock (&cluster_Jobs->lock);
4577 
4579  if (error != NO_ERROR)
4580  {
4581  assert (false);
4582  return ER_FAILED;
4583  }
4584 
4585  return NO_ERROR;
4586 }
4587 
4588 
4589 /*
4590  * hb_cluster_initialize - allocate (on first use) and reset hb_Cluster, load the node list,
      * and create and bind the UDP heartbeat socket
4591  * return: NO_ERROR, ER_PRM_BAD_VALUE (nodes is NULL or fewer than one node was loaded),
      * ER_OUT_OF_VIRTUAL_MEMORY (allocation failed) or ER_FAILED (no valid ping server)
4592  *
      * nodes(in): node list string handed to hb_cluster_load_group_and_node_list; must not be NULL
      * replicas(in): replica list string handed to hb_cluster_load_group_and_node_list
      *
      * NOTE(review): the error-set lines, the body of the GETHOSTNAME failure branch, one
      * statement after the node list is loaded and the return statements of the socket () and
      * bind () failure branches are not visible in this copy.
4593  */
4594 static int
4595 hb_cluster_initialize (const char *nodes, const char *replicas)
4596 {
4597  int rv;
4598  struct sockaddr_in udp_saddr;
4599  char host_name[CUB_MAXHOSTNAMELEN];
4600 
4601  if (nodes == NULL)
4602  {
4604 
4605  return ER_PRM_BAD_VALUE;
4606  }
4607 
4608  if (hb_Cluster == NULL)
4609  {
4610  hb_Cluster = (HB_CLUSTER *) malloc (sizeof (HB_CLUSTER));
4611  if (hb_Cluster == NULL)
4612  {
4614  return ER_OUT_OF_VIRTUAL_MEMORY;
4615  }
4616 
4617  pthread_mutex_init (&hb_Cluster->lock, NULL);
4618  }
4619 
4620  if (GETHOSTNAME (host_name, sizeof (host_name)))
4621  {
4624  }
4625 
      /* everything from here to the final unlock runs under hb_Cluster->lock */
4626  rv = pthread_mutex_lock (&hb_Cluster->lock);
4627  hb_Cluster->shutdown = false;
4628  hb_Cluster->hide_to_demote = false;
4629  hb_Cluster->is_isolated = false;
4630  hb_Cluster->is_ping_check_enabled = true;
4631  hb_Cluster->sfd = INVALID_SOCKET;
      /* copy at most size - 1 bytes and terminate explicitly, since strncpy may leave no terminator */
4632  strncpy (hb_Cluster->host_name, host_name, sizeof (hb_Cluster->host_name) - 1);
4633  hb_Cluster->host_name[sizeof (hb_Cluster->host_name) - 1] = '\0';
      /* initial node state: replica when HA mode is replica, slave otherwise */
4634  if (HA_GET_MODE () == HA_MODE_REPLICA)
4635  {
4636  hb_Cluster->state = HB_NSTATE_REPLICA;
4637  }
4638  else
4639  {
4640  hb_Cluster->state = HB_NSTATE_SLAVE;
4641  }
4642  hb_Cluster->master = NULL;
4643  hb_Cluster->myself = NULL;
4644  hb_Cluster->nodes = NULL;
4645 
4646  hb_Cluster->ping_hosts = NULL;
4647  hb_Cluster->ui_nodes = NULL;
4648  hb_Cluster->num_ui_nodes = 0;
4649 
4650  hb_Cluster->num_nodes = hb_cluster_load_group_and_node_list ((char *) nodes, (char *) replicas);
4651  if (hb_Cluster->num_nodes < 1)
4652  {
4653  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "hb_Cluster->num_nodes is smaller than '1'. (num_nodes=%d). \n",
4654  hb_Cluster->num_nodes);
4655  pthread_mutex_unlock (&hb_Cluster->lock);
4656 
4658  return ER_PRM_BAD_VALUE;
4659  }
4660 
4662 
4663  if (hb_cluster_check_valid_ping_server () == false)
4664  {
4665  pthread_mutex_unlock (&hb_Cluster->lock);
4666  return ER_FAILED;
4667  }
4668 
4669 #if defined (HB_VERBOSE_DEBUG)
4670  hb_print_nodes ();
4671 #endif
4672 
4673  /* initialize udp socket */
4674  hb_Cluster->sfd = socket (AF_INET, SOCK_DGRAM, 0);
4675  if (hb_Cluster->sfd < 0)
4676  {
4678  pthread_mutex_unlock (&hb_Cluster->lock);
4680  }
4681 
      /* bind to every local address on the port given by PRM_ID_HA_PORT_ID */
4682  memset ((void *) &udp_saddr, 0, sizeof (udp_saddr));
4683  udp_saddr.sin_family = AF_INET;
4684  udp_saddr.sin_addr.s_addr = htonl (INADDR_ANY);
4685  udp_saddr.sin_port = htons (prm_get_integer_value (PRM_ID_HA_PORT_ID));
4686 
4687  if (bind (hb_Cluster->sfd, (struct sockaddr *) &udp_saddr, sizeof (udp_saddr)) < 0)
4688  {
4690  pthread_mutex_unlock (&hb_Cluster->lock);
4692  }
4693 
4694  pthread_mutex_unlock (&hb_Cluster->lock);
4695 
4696  return NO_ERROR;
4697 }
4698 
4699 /*
4700  * hb_resource_initialize - allocate (on first use) and reset hb_Resource
4701  * return: NO_ERROR, or ER_OUT_OF_VIRTUAL_MEMORY when hb_Resource cannot be allocated
4702  *
      * hb_Resource and its mutex are created only when hb_Resource is NULL. On every call the
      * fields are reset under the mutex: not shut down, state HB_NSTATE_SLAVE, empty process list.
      * NOTE(review): the function-name line and the error-set line of the allocation failure are
      * not visible in this copy.
4703  */
4704 static int
4706 {
4707  int rv;
4708 
4709  if (hb_Resource == NULL)
4710  {
4711  hb_Resource = (HB_RESOURCE *) malloc (sizeof (HB_RESOURCE));
4712  if (hb_Resource == NULL)
4713  {
4715  return ER_OUT_OF_VIRTUAL_MEMORY;
4716  }
4717 
4718  pthread_mutex_init (&hb_Resource->lock, NULL);
4719  }
4720 
4721  rv = pthread_mutex_lock (&hb_Resource->lock);
4722  hb_Resource->shutdown = false;
4723  hb_Resource->state = HB_NSTATE_SLAVE;
4724  hb_Resource->num_procs = 0;
4725  hb_Resource->procs = NULL;
4726  pthread_mutex_unlock (&hb_Resource->lock);
4727 
4728  return NO_ERROR;
4729 }
4730 
4731 /*
4732  * hb_resource_job_initialize - allocate (on first use) and reset the resource job queue
4733  * return: NO_ERROR, ER_OUT_OF_VIRTUAL_MEMORY when the queue cannot be allocated, or ER_FAILED
4734  *
      * resource_Jobs and its mutex are created only when resource_Jobs is NULL; on every call the
      * queue fields are reset under the mutex and job_funcs is pointed at hb_resource_jobs.
      * NOTE(review): the function-name line, the error-set line of the allocation failure and the
      * right-hand side of the "error =" assignment are not visible in this copy.
4735  */
4736 static int
4738 {
4739  int rv, error;
4740 
4741  if (resource_Jobs == NULL)
4742  {
4743  resource_Jobs = (HB_JOB *) malloc (sizeof (HB_JOB));
4744  if (resource_Jobs == NULL)
4745  {
4747  return ER_OUT_OF_VIRTUAL_MEMORY;
4748  }
4749 
4750  pthread_mutex_init (&resource_Jobs->lock, NULL);
4751  }
4752 
4753  rv = pthread_mutex_lock (&resource_Jobs->lock);
4754  resource_Jobs->shutdown = false;
4755  resource_Jobs->num_jobs = 0;
4756  resource_Jobs->jobs = NULL;
4757  resource_Jobs->job_funcs = &hb_resource_jobs[0];
4758  pthread_mutex_unlock (&resource_Jobs->lock);
4759 
4760  error =
4764  if (error != NO_ERROR)
4765  {
4766  assert (false);
4767  return ER_FAILED;
4768  }
4769 
4770  return NO_ERROR;
4771 }
4772 
4773 /*
4774  * hb_thread_initialize - start the four heartbeat threads as detached threads
4775  * return: NO_ERROR, ER_CSS_PTHREAD_ATTR_INIT when the attribute object cannot be initialised,
      * or ER_CSS_PTHREAD_CREATE when a thread cannot be created
4776  *
      * Threads are started in this order: hb_thread_cluster_reader, hb_thread_cluster_worker,
      * hb_thread_resource_worker, hb_thread_check_disk_failure.
      * NOTE(review): the function-name line and the bodies of several failure branches
      * (setdetachstate, setscope, setstacksize, attr_destroy) are not visible in this copy, so
      * further return codes may exist. The visible pthread_create failure branches return
      * without a visible pthread_attr_destroy call - confirm against the original file.
4777  */
4778 static int
4780 {
4781  int rv;
4782 
4783  pthread_attr_t thread_attr;
4784  size_t ts_size;
4785  pthread_t cluster_worker_th;
4786  pthread_t resource_worker_th;
4787  pthread_t check_disk_failure_th;
4788 
4789  rv = pthread_attr_init (&thread_attr);
4790  if (rv != 0)
4791  {
4793  return ER_CSS_PTHREAD_ATTR_INIT;
4794  }
4795 
4796  rv = pthread_attr_setdetachstate (&thread_attr, PTHREAD_CREATE_DETACHED);
4797  if (rv != 0)
4798  {
4801  }
4802 
4803 #if defined(AIX)
4804  /* AIX's pthread is slightly different from other systems. Its performance highly depends on the pthread's scope and
4805  * it's related kernel parameters. */
4806  rv =
4807  pthread_attr_setscope (&thread_attr,
4808  prm_get_bool_value (PRM_ID_PTHREAD_SCOPE_PROCESS) ? PTHREAD_SCOPE_PROCESS :
4809  PTHREAD_SCOPE_SYSTEM);
4810 #else /* AIX */
4811  rv = pthread_attr_setscope (&thread_attr, PTHREAD_SCOPE_SYSTEM);
4812 #endif /* AIX */
4813  if (rv != 0)
4814  {
4817  }
4818 
4819  /* Sun Solaris allocates 1M for a thread stack, and it is quite enough */
4820 #if !defined(sun) && !defined(SOLARIS)
4821 #if defined(_POSIX_THREAD_ATTR_STACKSIZE)
      /* raise the stack size to PRM_ID_THREAD_STACKSIZE only when the current value is smaller */
4822  rv = pthread_attr_getstacksize (&thread_attr, &ts_size);
4823  if (ts_size < (size_t) prm_get_bigint_value (PRM_ID_THREAD_STACKSIZE))
4824  {
4825  rv = pthread_attr_setstacksize (&thread_attr, prm_get_bigint_value (PRM_ID_THREAD_STACKSIZE));
4826  if (rv != 0)
4827  {
4830  }
4831 
4832  pthread_attr_getstacksize (&thread_attr, &ts_size);
4833  }
4834 #endif /* _POSIX_THREAD_ATTR_STACKSIZE */
4835 #endif /* not sun && not SOLARIS */
4836 
4837 
      /* cluster_worker_th receives the id of the reader thread here and is overwritten by the next call */
4838  rv = pthread_create (&cluster_worker_th, &thread_attr, hb_thread_cluster_reader, NULL);
4839  if (rv != 0)
4840  {
4842  return ER_CSS_PTHREAD_CREATE;
4843  }
4844 
4845  rv = pthread_create (&cluster_worker_th, &thread_attr, hb_thread_cluster_worker, NULL);
4846  if (rv != 0)
4847  {
4849  return ER_CSS_PTHREAD_CREATE;
4850  }
4851 
4852  rv = pthread_create (&resource_worker_th, &thread_attr, hb_thread_resource_worker, NULL);
4853  if (rv != 0)
4854  {
4856  return ER_CSS_PTHREAD_CREATE;
4857  }
4858 
4859  rv = pthread_create (&check_disk_failure_th, &thread_attr, hb_thread_check_disk_failure, NULL);
4860  if (rv != 0)
4861  {
4863  return ER_CSS_PTHREAD_CREATE;
4864  }
4865 
4866  /* destroy thread_attribute */
4867  rv = pthread_attr_destroy (&thread_attr);
4868  if (rv != 0)
4869  {
4872  }
4873 
4874  return NO_ERROR;
4875 }
4876 
4877 /*
4878  * hb_master_init - initialise the heartbeat module: cluster, cluster jobs, resource,
      * resource jobs and worker threads, in that order
4879  * return: NO_ERROR, or the error code returned by the first initialisation step that failed
4880  *
      * Each failing step logs a debug message, writes db_error_string (3) to the utility log and
      * jumps to error_return. On success hb_Deactivate_immediately is set to false.
      * NOTE(review): the function-name line, the call that assigns error for the cluster step and
      * most cleanup calls under error_return are not visible in this copy.
4881  */
4882 int
4884 {
4885  int error;
4886 
4887  hb_enable_er_log ();
4888 
4890 #if defined (HB_VERBOSE_DEBUG)
4891  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "heartbeat params. (ha_mode:%s, heartbeat_nodes:{%s}" ", ha_port_id:%d). \n",
4892  (!HA_DISABLED ())? "yes" : "no",
4894 #endif
4895 
4897  error =
4899  if (error != NO_ERROR)
4900  {
4901  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "hb_cluster_initialize failed. " "(error=%d). \n", error);
4902  util_log_write_errstr ("%s\n", db_error_string (3));
4903  goto error_return;
4904  }
4905 
4906  error = hb_cluster_job_initialize ();
4907  if (error != NO_ERROR)
4908  {
4909  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "hb_cluster_job_initialize failed. " "(error=%d). \n", error);
4910  util_log_write_errstr ("%s\n", db_error_string (3));
4911  goto error_return;
4912  }
4913 
4914  error = hb_resource_initialize ();
4915  if (error != NO_ERROR)
4916  {
4917  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "hb_resource_initialize failed. " "(error=%d). \n", error);
4918  util_log_write_errstr ("%s\n", db_error_string (3));
4919  goto error_return;
4920  }
4921 
4922  error = hb_resource_job_initialize ();
4923  if (error != NO_ERROR)
4924  {
4925  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "hb_resource_job_initialize failed. " "(error=%d). \n", error);
4926  util_log_write_errstr ("%s\n", db_error_string (3));
4927  goto error_return;
4928  }
4929 
4930  error = hb_thread_initialize ();
4931  if (error != NO_ERROR)
4932  {
4933  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "hb_thread_initialize failed. " "(error=%d). \n", error);
4934  util_log_write_errstr ("%s\n", db_error_string (3));
4935  goto error_return;
4936  }
4937 
4938  hb_Deactivate_immediately = false;
4939 
4940  return NO_ERROR;
4941 
      /* undo only the parts that exist and are not already marked as shut down */
4942 error_return:
4943  if (hb_Cluster && hb_Cluster->shutdown == false)
4944  {
4945  hb_cluster_cleanup ();
4946  }
4947 
4948  if (cluster_Jobs && cluster_Jobs->shutdown == false)
4949  {
4951  }
4952 
4953  if (hb_Resource && hb_Resource->shutdown == false)
4954  {
4956  }
4957 
4958  if (resource_Jobs && resource_Jobs->shutdown == false)
4959  {
4961  }
4962 
4963  return error;
4964 }
4965 
4966 /*
4967  * terminator
4968  */
4969 
4970 /*
4971  * hb_resource_shutdown_all_ha_procs() - mark every process entry HB_PSTATE_DEREGISTERED and
      * detach or shut down its connection
4972  * return: none
4973  *
      * For entries with a connection: non-server entries get conn and sfd cleared; server entries
      * get a shutdown request through css_process_start_shutdown when a socket-queue entry with a
      * name is available, otherwise conn and sfd are cleared as well.
      * The function takes no lock itself.
      * NOTE(review): the function-name line, the statement that handles the socket-queue entry of
      * non-server processes, the assignment of sock_ent and the format string passed to snprintf
      * are not visible in this copy.
4974  */
4975 static void
4977 {
4978  HB_PROC_ENTRY *proc;
4979  SOCKET_QUEUE_ENTRY *sock_ent;
4980  char buffer[MASTER_TO_SRV_MSG_SIZE];
4981 
4982  /* set process state to deregister and close connection */
4983  for (proc = hb_Resource->procs; proc; proc = proc->next)
4984  {
4985  if (proc->conn)
4986  {
4987  if (proc->type != HB_PTYPE_SERVER)
4988  {
4989 #if defined (HB_VERBOSE_DEBUG)
4990  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "remove socket-queue entry. (pid:%d). \n", proc->pid);
4991 #endif
4993  proc->conn = NULL;
4994  proc->sfd = INVALID_SOCKET;
4995  }
4996  else
4997  {
4998  /* In case of HA server, just send shutdown request */
5000  assert_release (sock_ent == NULL || sock_ent->name != NULL);
5001  if (sock_ent != NULL && sock_ent->name != NULL)
5002  {
5003  memset (buffer, 0, sizeof (buffer));
      /* sock_ent->name + 1 passes the name without its first character */
5004  snprintf (buffer, sizeof (buffer) - 1,
5006  sock_ent->name + 1, 0);
5007 
5008  css_process_start_shutdown (sock_ent, 0, buffer);
5009  }
5010  else
5011  {
5012  proc->conn = NULL;
5013  proc->sfd = INVALID_SOCKET;
5014  }
5015  }
5016  }
5017  else
5018  {
5019  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "invalid socket-queue entry. (pid:%d).\n", proc->pid);
5020  }
5021 
      /* the state is changed for every entry, with or without a connection */
5022  proc->state = HB_PSTATE_DEREGISTERED;
5023  }
5024 
5025  return;
5026 }
5027 
5028 /*
5029  * hb_resource_cleanup() - kill the remaining processes, drop the process list and mark
      * hb_Resource as shut down
5030  * return:
5031  *
      * Runs under hb_Resource->lock. Every entry that still has a connection and a positive pid is
      * sent SIGKILL; then the list is removed, the state becomes HB_NSTATE_UNKNOWN and shutdown
      * is set to true.
      * NOTE(review): the function-name line and one statement right after the lock is taken are
      * not visible in this copy.
5032  */
5033 static void
5035 {
5036  HB_PROC_ENTRY *proc;
5037 
5038  pthread_mutex_lock (&hb_Resource->lock);
5039 
5041 
5042  for (proc = hb_Resource->procs; proc; proc = proc->next)
5043  {
5044  if (proc->conn && proc->pid > 0)
5045  {
5046  kill (proc->pid, SIGKILL);
5047  }
5048  }
5049 
5050  hb_remove_all_procs (hb_Resource->procs);
5051  hb_Resource->procs = NULL;
5052  hb_Resource->num_procs = 0;
5053  hb_Resource->state = HB_NSTATE_UNKNOWN;
5054  hb_Resource->shutdown = true;
5055  pthread_mutex_unlock (&hb_Resource->lock);
5056 
5057  return;
5058 }
5059 
5060 /*
5061  * hb_resource_shutdown_and_cleanup() - public (non-static) entry point for resource shutdown
5062  * return:
5063  *
      * NOTE(review): the function-name line and the statements of the body are not visible in this
      * copy; only the final return can be read. Presumably the body calls the static shutdown and
      * cleanup helpers defined above - TODO confirm against the original file.
5064  */
5065 void
5067 {
5070  return;
5071 }
5072 
5073 /*
5074  * hb_cluster_cleanup() - drop the node list, close the heartbeat socket and mark hb_Cluster
      * as shut down
5075  * return:
5076  *
      * Runs under hb_Cluster->lock. The state is set to HB_NSTATE_UNKNOWN first; the loop then
      * visits every node whose host name differs from this host's name and increments its
      * heartbeat_gap. Afterwards the node list, master and myself pointers, socket, ping-host
      * list and ui-node list are reset.
      * NOTE(review): the function-name line, one statement inside the node loop and the statements
      * that precede the ping_hosts and ui_nodes resets are not visible in this copy.
5077  */
5078 static void
5080 {
5081  int rv;
5082  HB_NODE_ENTRY *node;
5083 
5084  rv = pthread_mutex_lock (&hb_Cluster->lock);
5085  hb_Cluster->state = HB_NSTATE_UNKNOWN;
5086 
5087  for (node = hb_Cluster->nodes; node; node = node->next)
5088  {
      /* skip the entry for this host */
5089  if (are_hostnames_equal (hb_Cluster->host_name, node->host_name))
5090  {
5091  continue;
5092  }
5093 
5095  node->heartbeat_gap++;
5096  }
5097 
5098  hb_cluster_remove_all_nodes (hb_Cluster->nodes);
5099  hb_Cluster->nodes = NULL;
5100  hb_Cluster->master = NULL;
5101  hb_Cluster->myself = NULL;
5102  hb_Cluster->shutdown = true;
5103  if (hb_Cluster->sfd != INVALID_SOCKET)
5104  {
5105  close (hb_Cluster->sfd);
5106  hb_Cluster->sfd = INVALID_SOCKET;
5107  }
5108 
5110  hb_Cluster->ping_hosts = NULL;
5111  hb_Cluster->num_ping_hosts = 0;
5112 
5114  hb_Cluster->ui_nodes = NULL;
5115  hb_Cluster->num_ui_nodes = 0;
5116 
5117  pthread_mutex_unlock (&hb_Cluster->lock);
5118 
5119  return;
5120 }
5121 
5122 /*
5123  * hb_cluster_cleanup() - public (non-static) wrapper that ends by calling hb_cluster_cleanup ()
5124  * return:
5125  *
      * NOTE(review): this header repeats the name of the static hb_cluster_cleanup defined just
      * above, so it looks copy-pasted. The function-name line and the first statement of the body
      * are not visible in this copy; by analogy with hb_resource_shutdown_and_cleanup the name is
      * presumably hb_cluster_shutdown_and_cleanup - TODO confirm and correct this header.
5126  */
5127 void
5129 {
5131  hb_cluster_cleanup ();
5132 }
5133 
5134 /*
5135  * hb_process_state_string - map a process state to its display string
5136  * return: process state string; "invalid" when no case returns a value
5137  *
5138  * ptype(in): process type; consulted for HB_PSTATE_REGISTERED, where HB_PTYPE_SERVER takes a separate branch
5139  * pstate(in): process state to translate
      *
      * NOTE(review): several case labels and return statements of the switch, including the
      * return of the HB_PTYPE_SERVER branch, are not visible in this copy.
5140  */
5141 const char *
5142 hb_process_state_string (unsigned char ptype, int pstate)
5143 {
5144  switch (pstate)
5145  {
5146  case HB_PSTATE_UNKNOWN:
5147  return HB_PSTATE_UNKNOWN_STR;
5148  case HB_PSTATE_DEAD:
5149  return HB_PSTATE_DEAD_STR;
5152  case HB_PSTATE_STARTED:
5153  return HB_PSTATE_STARTED_STR;
5156  case HB_PSTATE_REGISTERED:
5157  if (ptype == HB_PTYPE_SERVER)
5158  {
5160  }
5161  else
5162  {
5163  return HB_PSTATE_REGISTERED_STR;
5164  }
5171  }
5172 
5173  return "invalid";
5174 }
5175 
5176 /*
5177  * hb_ping_result_string - map a ping result code to its display string
5178  * return: ping result string; "invalid" for any value not listed in the switch
5179  *
5180  * ping_result(in): one of the HB_PING_* result codes
5181  */
5182 const char *
5183 hb_ping_result_string (int ping_result)
5184 {
5185  switch (ping_result)
5186  {
5187  case HB_PING_UNKNOWN:
5188  return HB_PING_UNKNOWN_STR;
5189  case HB_PING_SUCCESS:
5190  return HB_PING_SUCCESS_STR;
5191  case HB_PING_USELESS_HOST:
5192  return HB_PING_USELESS_HOST_STR;
5193  case HB_PING_SYS_ERR:
5194  return HB_PING_SYS_ERR_STR;
5195  case HB_PING_FAILURE:
5196  return HB_PING_FAILURE_STR;
5197  }
5198 
      /* the switch has no default: unlisted values fall through to here */
5199  return "invalid";
5200 }
5201 
5202 /*
5203  * hb_reload_config - rebuild the ping-host list and the node list, keeping the previous
      * lists as a backup that is restored when the new configuration is rejected
5204  * return: NO_ERROR, ER_FAILED (hb_Cluster is NULL, an early check fails, or no valid ping
      * server) or ER_PRM_BAD_VALUE (fewer than one node, or the current master is not in the
      * reloaded list)
5205  *
      * On success, each reloaded node that also existed before inherits the state, score,
      * heartbeat_gap and last_recv_hbtime of its old entry, and the master pointer is re-targeted
      * to the new entry with the old master's host name.
      * NOTE(review): the function-name line, the statement that re-reads the parameters, the
      * condition of the early ER_FAILED return, the calls that reload ping hosts and nodes, and
      * single statements in the success and error paths are not visible in this copy.
5206  */
5207 static int
5209 {
5210  int rv, old_num_nodes, old_num_ping_hosts, error;
5211  HB_NODE_ENTRY *old_nodes;
5212  HB_NODE_ENTRY *old_node, *old_myself, *old_master, *new_node;
5213  HB_PING_HOST_ENTRY *old_ping_hosts;
5214 
5215  if (hb_Cluster == NULL)
5216  {
5217  return ER_FAILED;
5218  }
5219 
5221 
5222 #if defined (HB_VERBOSE_DEBUG)
5223  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "reload configuration. (nodes:{%s}).\n",
5225 #endif
5226 
5228  {
5229  return ER_FAILED;
5230  }
5231 
      /* everything below runs under hb_Cluster->lock, which both exits release */
5232  rv = pthread_mutex_lock (&hb_Cluster->lock);
5233 
5234  /* backup old ping hosts */
5235  hb_list_move ((HB_LIST **) (&old_ping_hosts), (HB_LIST **) (&hb_Cluster->ping_hosts));
5236  old_num_ping_hosts = hb_Cluster->num_ping_hosts;
5237 
5238  hb_Cluster->ping_hosts = NULL;
5239 
5240  /* backup old node list */
5241  hb_list_move ((HB_LIST **) (&old_nodes), (HB_LIST **) (&hb_Cluster->nodes));
5242  old_myself = hb_Cluster->myself;
5243  old_master = hb_Cluster->master;
5244  old_num_nodes = hb_Cluster->num_nodes;
5245 
5246  hb_Cluster->nodes = NULL;
5247 
5248  /* reload ping hosts */
5250 
5251  if (hb_cluster_check_valid_ping_server () == false)
5252  {
5253  error = ER_FAILED;
5254  goto reconfig_error;
5255  }
5256 
5257  /* reload node list */
5258  hb_Cluster->num_nodes =
5261 
      /* reject the new list when it is empty or no longer contains the current master */
5262  if (hb_Cluster->num_nodes < 1
5263  || (hb_Cluster->master && hb_return_node_by_name (hb_Cluster->master->host_name) == NULL))
5264  {
5266  error = ER_PRM_BAD_VALUE;
5267  goto reconfig_error;
5268  }
5269 
      /* carry the runtime data of every old node over to the new node with the same host name */
5270  for (new_node = hb_Cluster->nodes; new_node; new_node = new_node->next)
5271  {
5272 #if defined (HB_VERBOSE_DEBUG)
5273  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "reloaded nodes list. (nodes:{%s}).\n", new_node->host_name);
5274 #endif
5275  for (old_node = old_nodes; old_node; old_node = old_node->next)
5276  {
5277  if (!are_hostnames_equal (new_node->host_name, old_node->host_name))
5278  {
5279  continue;
5280  }
5281  if (old_master && are_hostnames_equal (new_node->host_name, old_master->host_name))
5282  {
5283  hb_Cluster->master = new_node;
5284  }
5285  new_node->state = old_node->state;
5286  new_node->score = old_node->score;
5287  new_node->heartbeat_gap = old_node->heartbeat_gap;
5288  new_node->last_recv_hbtime.tv_sec = old_node->last_recv_hbtime.tv_sec;
5289  new_node->last_recv_hbtime.tv_usec = old_node->last_recv_hbtime.tv_usec;
5290 
5291  /* mark node wouldn't deregister */
5292  old_node->host_name[0] = '\0';
5293  }
5294  }
5295 
5297 
5298  /* clean up ping host backup */
5299  if (old_ping_hosts != NULL)
5300  {
5301  hb_cluster_remove_all_ping_hosts (old_ping_hosts);
5302  }
5303 
5304  /* clean up node list backup */
5305  if (old_nodes)
5306  {
5307  hb_cluster_remove_all_nodes (old_nodes);
5308  }
5309  pthread_mutex_unlock (&hb_Cluster->lock);
5310 
5311  return NO_ERROR;
5312 
      /* failure path: discard whatever was reloaded and put the backups back in place */
5313 reconfig_error:
5314  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "reconfigure heartebat failed. " "(num_nodes:%d, master:{%s}).\n",
5315  hb_Cluster->num_nodes, (hb_Cluster->master) ? hb_Cluster->master->host_name : "-");
5316 
5317 /* restore ping hosts */
5318  hb_Cluster->num_ping_hosts = old_num_ping_hosts;
5319 
5321 
5322  hb_list_move ((HB_LIST **) (&hb_Cluster->ping_hosts), (HB_LIST **) (&old_ping_hosts));
5323 
5324  /* restore node list */
5325  hb_cluster_remove_all_nodes (hb_Cluster->nodes);
5326  hb_Cluster->myself = old_myself;
5327  hb_Cluster->master = old_master;
5328  hb_Cluster->num_nodes = old_num_nodes;
5329 
5330  hb_list_move ((HB_LIST **) (&hb_Cluster->nodes), (HB_LIST **) (&old_nodes));
5331 
5332  pthread_mutex_unlock (&hb_Cluster->lock);
5333 
5334  return error;
5335 }
5336 
5337 #if defined (ENABLE_UNUSED_FUNCTION)
      /*
      * hb_deregister_nodes - deregister the non-server processes that belong to the given nodes
      * return: none
      *
      * node_to_dereg(in/out): ':'-separated list of node names; it is modified in place because
      * it is tokenised with strtok_r
      *
      * Compiled only when ENABLE_UNUSED_FUNCTION is defined. For each token, hb_Resource->lock is
      * held while the process list is scanned. A process matches when the text after the last
      * '_' in proc->argv[3] starts with the token (strncmp over strlen of the token, so the match
      * is a prefix match).
      * NOTE(review): the statement that assigns error from job_arg is not visible in this copy.
      */
5338 static void
5339 hb_deregister_nodes (char *node_to_dereg)
5340 {
5341  const char *delim = ":";
5342  int error;
5343  HB_PROC_ENTRY *proc;
5344  HB_JOB_ARG *job_arg;
5345  char *p, *savep;
5346  char *node_name;
5347  char *log_path;
5348 
5349  for (p = strtok_r (node_to_dereg, delim, &savep); p; p = strtok_r (NULL, delim, &savep))
5350  {
5351 
5352  (void) pthread_mutex_lock (&hb_Resource->lock);
5353  for (proc = hb_Resource->procs; proc; proc = proc->next)
5354  {
      /* server processes are never deregistered here */
5355  if (proc->type == HB_PTYPE_SERVER)
5356  {
5357  continue;
5358  }
5359 
5360  job_arg = NULL;
5361 
5362  log_path = proc->argv[3];
5363  node_name = strrchr (log_path, '_');
5364  if (node_name)
5365  {
5366  node_name++;
5367  if (strncmp (node_name, p, strlen (p)) == 0)
5368  {
5369  job_arg = hb_deregister_process (proc);
5370  }
5371  }
5372  if (job_arg)
5373  {
5375  if (error != NO_ERROR)
5376  {
5377  assert (false);
5378  free_and_init (job_arg);
5379  }
5380  }
5381  }
5382  (void) pthread_mutex_unlock (&hb_Resource->lock);
5383  }
5384 
5385  return;
5386 }
5387 #endif /* ENABLE_UNUSED_FUNCTION */
5388 
5389 /*
5390  * hb_get_admin_info_string - build the admin info text into a newly malloc'ed buffer
5391  * return: none
5392  *
5393  * str(out): receives the buffer on success; it is left untouched when no text is produced
      *
      * If *str is already non-NULL on entry, its first character is set to 0 and nothing else is
      * done. No text is produced when css_Master_er_log_enabled is true or hb_Nolog_event_msg is
      * empty. On allocation failure ER_OUT_OF_VIRTUAL_MEMORY is set and *str stays NULL.
      * The visible code never frees the buffer, so releasing it is presumably left to the
      * caller - TODO confirm.
      * NOTE(review): the function-name line, the lock and unlock statements suggested by the rv
      * variable and two lines of the buf_size computation are not visible in this copy.
5394  */
5395 void
5397 {
5398  int rv, buf_size = 0;
5399  char *p, *last;
5400 
5401  if (*str)
5402  {
5403  **str = 0;
5404  return;
5405  }
5406 
5408 
5409  if (css_Master_er_log_enabled == true || hb_Nolog_event_msg[0] == '\0')
5410  {
5412  return;
5413  }
5414 
5415  buf_size = strlen (HA_ADMIN_INFO_FORMAT_STRING);
5418  buf_size += strlen (hb_Nolog_event_msg);
5419 
5420  *str = (char *) malloc (sizeof (char) * buf_size);
5421  if (*str == NULL)
5422  {
5424  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, sizeof (char) * buf_size);
5425  return;
5426  }
5427  **str = '\0';
5428 
5429  p = (char *) (*str);
5430  last = p + buf_size;
5431 
      /* each snprintf is bounded by the space left; MAX (.., 0) keeps the size argument non-negative */
5432  p += snprintf (p, MAX ((last - p), 0), HA_ADMIN_INFO_FORMAT_STRING);
5433  p += snprintf (p, MAX ((last - p), 0), HA_ADMIN_INFO_NOLOG_FORMAT_STRING);
5434  p += snprintf (p, MAX ((last - p), 0), HA_ADMIN_INFO_NOLOG_EVENT_FORMAT_STRING, hb_Nolog_event_msg);
5435 
5437 
5438  return;
5439 }
5440 
5441 /*
5442  * hb_get_ping_host_info_string - build the ping-host status text into a newly malloc'ed buffer
5443  * return: none
5444  *
5445  * str(out): receives the buffer on success; it is left untouched when no text is produced
      *
      * Nothing is produced when hb_Cluster is NULL or when there are no ping hosts. If *str is
      * already non-NULL on entry, its first character is set to 0 and nothing else is done.
      * The ping-host information is refreshed with hb_cluster_check_valid_ping_server () before it
      * is formatted; all of this runs under hb_Cluster->lock. On allocation failure
      * ER_OUT_OF_VIRTUAL_MEMORY is set and *str stays NULL.
      * The visible code never frees the buffer, so releasing it is presumably left to the
      * caller - TODO confirm.
      * NOTE(review): the function-name line and the statement executed when the computed
      * is_ping_check_enabled differs from hb_Cluster->is_ping_check_enabled are not visible here.
5446  */
5447 void
5449 {
5450  int rv, buf_size = 0, required_size = 0;
5451  char *p, *last;
5452  bool valid_ping_host_exists;
5453  bool is_ping_check_enabled = true;
5454  HB_PING_HOST_ENTRY *ping_host;
5455 
5456  if (hb_Cluster == NULL)
5457  {
5458  return;
5459  }
5460 
5461  if (*str)
5462  {
5463  **str = 0;
5464  return;
5465  }
5466 
5467  rv = pthread_mutex_lock (&hb_Cluster->lock);
5468 
5469  if (hb_Cluster->num_ping_hosts == 0)
5470  {
5471  pthread_mutex_unlock (&hb_Cluster->lock);
5472  return;
5473  }
5474 
5475  /* refresh ping host info */
5476  valid_ping_host_exists = hb_cluster_check_valid_ping_server ();
5477 
      /* the check is reported as disabled only when no ping host is valid and the cluster is not isolated */
5478  if (valid_ping_host_exists == false && hb_cluster_is_isolated () == false)
5479  {
5480  is_ping_check_enabled = false;
5481  }
5482 
5483  if (is_ping_check_enabled != hb_Cluster->is_ping_check_enabled)
5484  {
5486  }
5487 
      /* NOTE(review): "disabled" is 8 characters while 7 are reserved here; presumably the
       * characters of the format specifier counted by strlen cover the difference - confirm */
5488  required_size = strlen (HA_PING_HOSTS_INFO_FORMAT_STRING);
5489  required_size += 7; /* length of ping check status */
5490 
5491  buf_size += required_size;
5492 
      /* per-host size: format text + host name + result string, multiplied by the number of hosts */
5493  required_size = strlen (HA_PING_HOSTS_FORMAT_STRING);
5494  required_size += CUB_MAXHOSTNAMELEN;
5495  required_size += HB_PING_STR_SIZE; /* length of ping test result */
5496  required_size *= hb_Cluster->num_ping_hosts;
5497 
5498  buf_size += required_size;
5499 
5500  *str = (char *) malloc (sizeof (char) * buf_size);
5501  if (*str == NULL)
5502  {
5503  pthread_mutex_unlock (&hb_Cluster->lock);
5504  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, sizeof (char) * buf_size);
5505  return;
5506  }
5507  **str = '\0';
5508 
5509  p = (char *) (*str);
5510  last = p + buf_size;
5511 
      /* each snprintf is bounded by the space left; MAX (.., 0) keeps the size argument non-negative */
5512  p +=
5513  snprintf (p, MAX ((last - p), 0), HA_PING_HOSTS_INFO_FORMAT_STRING, is_ping_check_enabled ? "enabled" : "disabled");
5514 
5515  for (ping_host = hb_Cluster->ping_hosts; ping_host; ping_host = ping_host->next)
5516  {
5517  p +=
5518  snprintf (p, MAX ((last - p), 0), HA_PING_HOSTS_FORMAT_STRING, ping_host->host_name,
5519  hb_ping_result_string (ping_host->ping_result));
5520  }
5521 
5522  pthread_mutex_unlock (&hb_Cluster->lock);
5523 
5524  return;
5525 }
5526 
5527 /*
5528  * hb_get_node_info_string -
5529  * return: none
5530  *
5531  * str(out):
5532  * verbose_yn(in):
5533  */
5534 void
5535 hb_get_node_info_string (char **str, bool verbose_yn)
5536 {
5537  HB_NODE_ENTRY *node;
5538  HB_UI_NODE_ENTRY *ui_node;
5539  int rv, buf_size = 0, required_size = 0;
5540  char *p, *last;
5541  struct timeval now;
5542  char *ipv4_p;
5543  char ipv4_str[HB_IPV4_STR_LEN];
5544 
5545  if (hb_Cluster == NULL)
5546  {
5547  return;
5548  }
5549 
5550  if (*str)
5551  {
5552  **str = 0;
5553  return;
5554  }
5555 
5556  required_size = strlen (HA_NODE_INFO_FORMAT_STRING);
5557  required_size += CUB_MAXHOSTNAMELEN; /* length of node name */
5558  required_size += HB_NSTATE_STR_SZ; /* length of node state */
5559  buf_size += required_size;
5560 
5561  required_size = strlen (HA_NODE_FORMAT_STRING);
5562  required_size += CUB_MAXHOSTNAMELEN; /* length of node name */
5563  required_size += 5; /* length of priority */
5564  required_size += HB_NSTATE_STR_SZ; /* length of node state */
5565  if (verbose_yn)
5566  {
5567  required_size += strlen (HA_NODE_SCORE_FORMAT_STRING);
5568  required_size += 6; /* length of score */
5569  required_size += strlen (HA_NODE_HEARTBEAT_GAP_FORMAT_STRING);
5570  required_size += 6; /* length of missed heartbeat */
5571  }
5572 
5573  rv = pthread_mutex_lock (&hb_Cluster->lock);
5574 
5575  required_size *= hb_Cluster->num_nodes;
5576  buf_size += required_size;
5577 
5578  /* unidentified node info */
5579  required_size = strlen (HA_UI_NODE_FORMAT_STRING);
5580  required_size += HB_IPV4_STR_LEN;
5581  required_size += HB_MAX_GROUP_ID_LEN;
5582  required_size += HB_NSTATE_STR_SZ;
5583 
5584  required_size *= hb_Cluster->num_ui_nodes;
5585 
5586  buf_size += required_size;
5587 
5588  *str = (char *) malloc (sizeof (char) * buf_size);
5589  if (*str == NULL)
5590  {
5591  pthread_mutex_unlock (&hb_Cluster->lock);
5592  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, sizeof (char) * buf_size);
5593  return;
5594  }
5595  **str = '\0';
5596 
5597  p = (char *) (*str);
5598  last = p + buf_size;
5599 
5600  p +=
5601  snprintf (p, MAX ((last - p), 0), HA_NODE_INFO_FORMAT_STRING, hb_Cluster->host_name,
5602  hb_node_state_string (hb_Cluster->state));
5603 
5604  for (node = hb_Cluster->nodes; node; node = node->next)
5605  {
5606  p +=
5607  snprintf (p, MAX ((last - p), 0), HA_NODE_FORMAT_STRING, node->host_name, node->priority,
5608  hb_node_state_string (node->state));
5609  if (verbose_yn)
5610  {
5611  p += snprintf (p, MAX ((last - p), 0), HA_NODE_SCORE_FORMAT_STRING, node->score);
5612  p += snprintf (p, MAX ((last - p), 0), HA_NODE_HEARTBEAT_GAP_FORMAT_STRING, node->heartbeat_gap);
5613  }
5614  }
5615 
5616  hb_cleanup_ui_nodes (hb_Cluster->ui_nodes);
5617  gettimeofday (&now, NULL);
5618  for (ui_node = hb_Cluster->ui_nodes; ui_node; ui_node = ui_node->next)
5619  {
5621  {
5622  continue;
5623  }
5624 
5625  ipv4_p = (char *) &ui_node->saddr.sin_addr.s_addr;
5626  snprintf (ipv4_str, sizeof (ipv4_str), "%u.%u.%u.%u", (unsigned char) ipv4_p[0], (unsigned char) ipv4_p[1],
5627  (unsigned char) ipv4_p[2], (unsigned char) ipv4_p[3]);
5628  p +=
5629  snprintf (p, MAX ((last - p), 0), HA_UI_NODE_FORMAT_STRING, ui_node->host_name, ipv4_str, ui_node->group_id,
5630  hb_valid_result_string (ui_node->v_result));
5631  }
5632 
5633  pthread_mutex_unlock (&hb_Cluster->lock);
5634  return;
5635 }
5636 
5637 /*
5638  * hb_get_process_info_string -
5639  * return: none
5640  *
5641  * str(out):
5642  * verbose_yn(in):
5643  */
5644 void
5645 hb_get_process_info_string (char **str, bool verbose_yn)
5646 {
5647  HB_PROC_ENTRY *proc;
5648  SOCKET_QUEUE_ENTRY *sock_entq;
5649  int rv, buf_size = 0, required_size = 0;
5650  char *p, *last;
5651  char time_str[64];
5652 
5653  if (hb_Resource == NULL)
5654  {
5655  return;
5656  }
5657 
5658  if (*str)
5659  {
5660  **str = 0;
5661  return;
5662  }
5663 
5664  required_size = strlen (HA_PROCESS_INFO_FORMAT_STRING);
5665  required_size += 10; /* length of pid */
5666  required_size += HB_NSTATE_STR_SZ; /* length of node state */
5667  buf_size += required_size;
5668 
5669  required_size = strlen (HA_APPLYLOG_PROCESS_FORMAT_STRING);
5670  required_size += 256; /* length of connection name */
5671  required_size += 10; /* length of pid */
5672  required_size += HB_PSTATE_STR_SZ; /* length of process state */
5673 
5674  if (verbose_yn)
5675  {
5676  required_size += strlen (HA_PROCESS_EXEC_PATH_FORMAT_STRING);
5677  required_size += HB_MAX_SZ_PROC_EXEC_PATH;
5678  required_size += strlen (HA_PROCESS_ARGV_FORMAT_STRING);
5679  required_size += HB_MAX_SZ_PROC_ARGS;
5681  required_size += 64; /* length of registered time */
5683  required_size += 64; /* length of deregistered time */
5685  required_size += 64; /* length of shutdown time */
5686  required_size += strlen (HA_PROCESS_START_TIME_FORMAT_STRING);
5687  required_size += 64; /* length of start time */
5688  }
5689 
5690  rv = pthread_mutex_lock (&hb_Resource->lock);
5691 
5692  required_size *= hb_Resource->num_procs;
5693  buf_size += required_size;
5694 
5695  *str = (char *) malloc (sizeof (char) * buf_size);
5696  if (*str == NULL)
5697  {
5698  pthread_mutex_unlock (&hb_Resource->lock);
5699  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_OUT_OF_VIRTUAL_MEMORY, 1, sizeof (char) * buf_size);
5700  return;
5701  }
5702  **str = '\0';
5703 
5704  p = (char *) (*str);
5705  last = p + buf_size;
5706 
5707  p +=
5708  snprintf (p, MAX ((last - p), 0), HA_PROCESS_INFO_FORMAT_STRING, getpid (),
5709  hb_node_state_string (hb_Resource->state));
5710 
5711  for (proc = hb_Resource->procs; proc; proc = proc->next)
5712  {
5714  assert_release (sock_entq == NULL || sock_entq->name != NULL);
5715  if (sock_entq == NULL || sock_entq->name == NULL)
5716  {
5717  continue;
5718  }
5719 
5720  switch (proc->type)
5721  {
5722  case HB_PTYPE_SERVER:
5723  p +=
5724  snprintf (p, MAX ((last - p), 0), HA_SERVER_PROCESS_FORMAT_STRING, sock_entq->name + 1, proc->pid,
5725  hb_process_state_string (proc->type, proc->state));
5726  break;
5727  case HB_PTYPE_COPYLOGDB:
5728  p +=
5729  snprintf (p, MAX ((last - p), 0), HA_COPYLOG_PROCESS_FORMAT_STRING, sock_entq->name + 1, proc->pid,
5730  hb_process_state_string (proc->type, proc->state));
5731  break;
5732  case HB_PTYPE_APPLYLOGDB:
5733  p +=
5734  snprintf (p, MAX ((last - p), 0), HA_APPLYLOG_PROCESS_FORMAT_STRING, sock_entq->name + 1, proc->pid,
5735  hb_process_state_string (proc->type, proc->state));
5736  break;
5737  default:
5738  break;
5739  }
5740 
5741  if (verbose_yn)
5742  {
5743  p += snprintf (p, MAX ((last - p), 0), HA_PROCESS_EXEC_PATH_FORMAT_STRING, proc->exec_path);
5744  p += snprintf (p, MAX ((last - p), 0), HA_PROCESS_ARGV_FORMAT_STRING, proc->args);
5745  p +=
5746  snprintf (p, MAX ((last - p), 0), HA_PROCESS_REGISTER_TIME_FORMAT_STRING,
5747  hb_strtime (time_str, sizeof (time_str), &proc->rtime));
5748  p +=
5749  snprintf (p, MAX ((last - p), 0), HA_PROCESS_DEREGISTER_TIME_FORMAT_STRING,
5750  hb_strtime (time_str, sizeof (time_str), &proc->dtime));
5751  p +=
5752  snprintf (p, MAX ((last - p), 0), HA_PROCESS_SHUTDOWN_TIME_FORMAT_STRING,
5753  hb_strtime (time_str, sizeof (time_str), &proc->ktime));
5754  p +=
5755  snprintf (p, MAX ((last - p), 0), HA_PROCESS_START_TIME_FORMAT_STRING,
5756  hb_strtime (time_str, sizeof (time_str), &proc->stime));
5757  }
5758  }
5759 
5760  pthread_mutex_unlock (&hb_Resource->lock);
5761 
5762  return;
5763 }
5764 
/*
 * hb_kill_process - terminate a list of processes, escalating to SIGKILL
 *   return: none
 *
 *   pids(in/out): process ids to terminate; an entry is reset to 0 once
 *                 kill () reports ESRCH for it
 *   count(in): number of entries in pids
 *
 * Note: SIGTERM is delivered on the first pass only. Later passes probe with
 *       signal 0, pausing between passes. Entries still set after the last
 *       pass receive SIGKILL.
 */
static void
hb_kill_process (pid_t * pids, int count)
{
  int max_attempts = 20;
  int pause_in_secs = 3;
  int sig = SIGTERM;
  int attempt, idx;
  int remaining;

  for (attempt = 0; attempt < max_attempts; attempt++)
    {
      remaining = 0;

      for (idx = 0; idx < count; idx++)
	{
	  if (pids[idx] <= 0)
	    {
	      /* already confirmed gone (or never valid) */
	      continue;
	    }

	  if (kill (pids[idx], sig) != 0 && errno == ESRCH)
	    {
	      /* no such process any more */
	      pids[idx] = 0;
	    }
	  else
	    {
	      remaining++;
	    }
	}

      if (remaining == 0)
	{
	  return;
	}

      /* from now on only probe for existence */
      sig = 0;
      SLEEP_MILISEC (pause_in_secs, 0);
    }

  /* retries exhausted: force whatever is left */
  for (idx = 0; idx < count; idx++)
    {
      if (pids[idx] > 0)
	{
	  kill (pids[idx], SIGKILL);
	}
    }
}
5817 
5818 /*
5819  * hb_kill_all_heartbeat_process -
5820  * return: none
5821  *
5822  * str(out):
5823  */
5824 void
5826 {
5827  int rv, count, i;
5828  pid_t *pids;
5829  HB_PROC_ENTRY *proc;
5830  size_t size;
5831 
5832  if (hb_Resource == NULL)
5833  {
5834  return;
5835  }
5836 
5837  count = 0;
5838  pids = NULL;
5839 
5840  rv = pthread_mutex_lock (&hb_Resource->lock);
5841  for (proc = hb_Resource->procs; proc; proc = proc->next)
5842  {
5843  if (proc->type == HB_PTYPE_APPLYLOGDB || proc->type == HB_PTYPE_COPYLOGDB)
5844  {
5845  size = sizeof (pid_t) * (count + 1);
5846  pids = (pid_t *) realloc (pids, size);
5847  if (pids == NULL)
5848  {
5849  pthread_mutex_unlock (&hb_Resource->lock);
5851  return;
5852  }
5853  pids[count] = proc->pid;
5854  count++;
5855  }
5856  }
5857  pthread_mutex_unlock (&hb_Resource->lock);
5858 
5859  for (i = 0; i < count; i++)
5860  {
5861  hb_deregister_by_pid (pids[i]);
5862  }
5863 
5864  free (pids);
5865 }
5866 
5867 /*
5868  * hb_deregister_by_pid -
5869  * return: none
5870  *
5871  * pid(in):
5872  * str(out):
5873  */
5874 void
5876 {
5877  int error = NO_ERROR;
5878  HB_PROC_ENTRY *proc;
5879  HB_JOB_ARG *job_arg;
5880  char error_string[LINE_MAX] = "";
5881 
5882  if (hb_Resource == NULL)
5883  {
5884  return;
5885  }
5886 
5887  (void) pthread_mutex_lock (&hb_Resource->lock);
5888  proc = hb_return_proc_by_pid (pid);
5889  if (proc == NULL)
5890  {
5891  (void) pthread_mutex_unlock (&hb_Resource->lock);
5892  snprintf (error_string, LINE_MAX, "%s. (cannot find process to deregister, pid:%d)", HB_RESULT_FAILURE_STR, pid);
5894  return;
5895  }
5896 
5897  job_arg = hb_deregister_process (proc);
5898  (void) pthread_mutex_unlock (&hb_Resource->lock);
5899 
5900  if (job_arg)
5901  {
5903  if (error != NO_ERROR)
5904  {
5905  assert (false);
5906  free_and_init (job_arg);
5907  }
5908  }
5909 
5910  snprintf (error_string, LINE_MAX, "%s. (pid:%d)", HB_RESULT_SUCCESS_STR, pid);
5912  error_string);
5913 
5914  return;
5915 }
5916 
5917 /*
5918  * hb_deregister_by_args -
5919  * return: none
5920  *
5921  * args(in):
5922  * str(out):
5923  */
5924 void
5926 {
5927  int error = NO_ERROR;
5928  HB_PROC_ENTRY *proc;
5929  HB_JOB_ARG *job_arg;
5930  char error_string[LINE_MAX] = "";
5931 
5932  if (hb_Resource == NULL)
5933  {
5934  return;
5935  }
5936 
5937  (void) pthread_mutex_lock (&hb_Resource->lock);
5938  proc = hb_return_proc_by_args (args);
5939  if (proc == NULL)
5940  {
5941  (void) pthread_mutex_unlock (&hb_Resource->lock);
5942  snprintf (error_string, LINE_MAX, "%s. (cannot find process to deregister, args:%s)", HB_RESULT_FAILURE_STR,
5943  args);
5945  error_string);
5946  return;
5947  }
5948 
5949  job_arg = hb_deregister_process (proc);
5950  (void) pthread_mutex_unlock (&hb_Resource->lock);
5951 
5952  if (job_arg)
5953  {
5955  if (error != NO_ERROR)
5956  {
5957  assert (false);
5958  free_and_init (job_arg);
5959  }
5960  }
5961 
5962  snprintf (error_string, LINE_MAX, "%s. (args:%s)", HB_RESULT_SUCCESS_STR, args);
5964  error_string);
5965 
5966  return;
5967 }
5968 
5969 static HB_JOB_ARG *
5971 {
5972  HB_JOB_ARG *job_arg;
5973  HB_RESOURCE_JOB_ARG *proc_arg;
5974  char error_string[LINE_MAX] = "";
5975 
5976  if ((proc->state < HB_PSTATE_DEAD) || (proc->state >= HB_PSTATE_MAX) || (proc->pid < 0))
5977  {
5978  snprintf (error_string, LINE_MAX, "%s. (unexpected process status or invalid pid, status:%d, pid:%d)",
5979  HB_RESULT_FAILURE_STR, proc->state, proc->pid);
5981  return NULL;
5982  }
5983 
5984  gettimeofday (&proc->dtime, NULL);
5985 
5986  job_arg = (HB_JOB_ARG *) malloc (sizeof (HB_JOB_ARG));
5987  if (job_arg == NULL)
5988  {
5990  return NULL;
5991  }
5992 
5993  proc_arg = &(job_arg->resource_job_arg);
5994  proc_arg->pid = proc->pid;
5995  memcpy ((void *) &proc_arg->args[0], proc->args, sizeof (proc_arg->args));
5996  proc_arg->retries = 0;
5998  gettimeofday (&proc_arg->ftime, NULL);
5999 
6000  proc->state = HB_PSTATE_DEREGISTERED;
6001 
6002  return job_arg;
6003 }
6004 
6005 /*
6006  * hb_reconfig_heartbeat -
6007  * return: none
6008  *
6009  * str(out):
6010  */
6011 void
6013 {
6014  int error;
6015  char error_string[LINE_MAX] = "";
6016 
6017  error = hb_reload_config ();
6018  if (error)
6019  {
6020  snprintf (error_string, LINE_MAX, "%s. (failed to reload CUBRID heartbeat configuration)", HB_RESULT_FAILURE_STR);
6022  *str = NULL;
6023  }
6024  else
6025  {
6026  snprintf (error_string, LINE_MAX, "%s.", HB_RESULT_SUCCESS_STR);
6028  hb_get_node_info_string (str, false);
6029 
6030  snprintf (error_string, LINE_MAX, "\n%s", (str && *str) ? *str : "");
6032  }
6033 
6034  return;
6035 }
6036 
6037 /*
6038  * hb_prepare_deactivate_heartbeat - shutdown all HA processes
6039  * to deactivate heartbeat
6040  * return:
6041  *
6042  */
6043 int
6045 {
6046  int rv, error = NO_ERROR;
6047  char error_string[LINE_MAX] = "";
6048 
6049  if (hb_Cluster == NULL || hb_Resource == NULL)
6050  {
6051  return ER_FAILED;
6052  }
6053 
6054  rv = pthread_mutex_lock (&hb_Resource->lock);
6055  if (hb_Resource->shutdown == true)
6056  {
6057  /* resources have already been cleaned up */
6058  pthread_mutex_unlock (&hb_Resource->lock);
6059 
6060  return NO_ERROR;
6061  }
6062  hb_Resource->shutdown = true;
6063  pthread_mutex_unlock (&hb_Resource->lock);
6064 
6066  if (error != NO_ERROR)
6067  {
6068  assert (false);
6069  }
6070  else
6071  {
6072  snprintf (error_string, LINE_MAX, "CUBRID heartbeat starts to shutdown all HA processes.");
6074  error_string);
6075  }
6076 
6077  return error;
6078 }
6079 
6080 /*
6081  * hb_deactivate_heartbeat -
6082  * return: ER_FAILED if hb_Cluster is NULL, NO_ERROR otherwise
6083  *
6084  */
6085 int
6087 {
6088  char error_string[LINE_MAX] = "";
6089 
6090  if (hb_Cluster == NULL)
6091  {
6092  return ER_FAILED;
6093  }
6094 
6095  if (hb_Is_activated == false)
6096  {
6097  snprintf (error_string, LINE_MAX, "%s. (CUBRID heartbeat feature already deactivated)", HB_RESULT_FAILURE_STR);
6099 
6100  return NO_ERROR;
6101  }
6102 
6103  if (hb_Resource != NULL && resource_Jobs != NULL)
6104  {
6106  }
6107 
6108  if (hb_Cluster != NULL && cluster_Jobs != NULL)
6109  {
6111  }
6112 
6113  hb_Is_activated = false;
6114 
6115  snprintf (error_string, LINE_MAX, "%s.", HB_RESULT_SUCCESS_STR);
6117 
6118  return NO_ERROR;
6119 }
6120 
6121 /*
6122  * hb_activate_heartbeat -
6123  * return: NO_ERROR on success or when already activated, ER_FAILED otherwise
6124  *
6125  * str(out):
6126  */
6127 int
6129 {
6130  int error;
6131  char error_string[LINE_MAX] = "";
6132 
6133  if (hb_Cluster == NULL)
6134  {
6135  return ER_FAILED;
6136  }
6137 
6138  /* unfinished job of deactivation exists */
6139  if (hb_Deactivate_info.info_started == true)
6140  {
6141  snprintf (error_string, LINE_MAX, "%s. (CUBRID heartbeat feature is being deactivated)", HB_RESULT_FAILURE_STR);
6143  return ER_FAILED;
6144  }
6145 
6146  if (hb_Is_activated == true)
6147  {
6148  snprintf (error_string, LINE_MAX, "%s. (CUBRID heartbeat feature already activated)", HB_RESULT_FAILURE_STR);
6150  return NO_ERROR;
6151  }
6152 
6153  error = hb_master_init ();
6154  if (error != NO_ERROR)
6155  {
6156  snprintf (error_string, LINE_MAX, "%s. (failed to initialize CUBRID heartbeat feature)", HB_RESULT_FAILURE_STR);
6158 
6159  return ER_FAILED;
6160  }
6161 
6162  hb_Is_activated = true;
6163 
6164  snprintf (error_string, LINE_MAX, "%s.", HB_RESULT_SUCCESS_STR);
6166 
6167  return NO_ERROR;
6168 }
6169 
6170 /*
6171  * hb_start_util_process -
6172  * return: ER_FAILED if hb_Resource is NULL or fork () fails, NO_ERROR otherwise
6173  *
6174  * args(in):
6175  * str(out):
6176  */
6177 int
6179 {
6180  char error_string[LINE_MAX] = "";
6181  HB_PROC_ENTRY *proc;
6182  int pid;
6183 
6184  char executable_path[PATH_MAX];
6185  int i, num_args = 0;
6186  char *s, *save;
6188  char *argvp[HB_MAX_NUM_PROC_ARGV];
6189 
6190  if (hb_Resource == NULL)
6191  {
6192  return ER_FAILED;
6193  }
6194 
6195  (void) pthread_mutex_lock (&hb_Resource->lock);
6196  proc = hb_return_proc_by_args (args);
6197  if (proc != NULL)
6198  {
6199  (void) pthread_mutex_unlock (&hb_Resource->lock);
6200 
6201  snprintf (error_string, LINE_MAX, "%s. (process already running, args:%s)", HB_RESULT_FAILURE_STR, args);
6203  error_string);
6204  return NO_ERROR;
6205  }
6206 
6207  pid = fork ();
6208  if (pid < 0)
6209  {
6210  (void) pthread_mutex_unlock (&hb_Resource->lock);
6211 
6213  return ER_FAILED;
6214  }
6215  else if (pid == 0)
6216  {
6217  memset (argvp, 0, sizeof (argvp));
6218  memset (argvs, 0, sizeof (argvs));
6219  s = strtok_r (args, " \t\n", &save);
6220  while (s)
6221  {
6222  strncpy (argvs[num_args++], s, HB_MAX_SZ_PROC_ARGV - 1);
6223  s = strtok_r (NULL, " \t\n", &save);
6224  }
6225 
6226  for (i = 0; i < num_args; i++)
6227  {
6228  argvp[i] = argvs[i];
6229  }
6230 
6231  envvar_bindir_file (executable_path, PATH_MAX, argvp[0]);
6232  (void) execv (executable_path, argvp);
6233 
6234  (void) pthread_mutex_unlock (&hb_Resource->lock);
6235  css_master_cleanup (SIGTERM);
6236  return NO_ERROR;
6237  }
6238  else
6239  {
6240  (void) pthread_mutex_unlock (&hb_Resource->lock);
6241  }
6242 
6243  return NO_ERROR;
6244 }
6245 
6246 
6247 /*
6248  * common
6249  */
6250 
6251 void
6253 {
6254  int rv;
6255 
6257 
6259  hb_Nolog_event_msg[0] = '\0';
6260 
6262  return;
6263 }
6264 
6265 void
6266 hb_disable_er_log (int reason, const char *msg_fmt, ...)
6267 {
6268  va_list args;
6269  char *p, *last;
6270  const char *event_name;
6271  char time_str[64];
6272  struct timeval curr_time;
6273  int rv;
6274 
6276 
6277  if (css_Master_er_log_enabled == false)
6278  {
6280  return;
6281  }
6282 
6283  if (reason == HB_NOLOG_DEMOTE_ON_DISK_FAIL)
6284  {
6285  event_name = "DEMOTE ON DISK FAILURE";
6286  }
6287  else if (reason == HB_NOLOG_REMOTE_STOP)
6288  {
6289  event_name = "REMOTE STOP";
6290  }
6291  else
6292  {
6294  return;
6295  }
6296 
6297  css_Master_er_log_enabled = false;
6298 
6299  p = hb_Nolog_event_msg;
6300  last = hb_Nolog_event_msg + sizeof (hb_Nolog_event_msg);
6301 
6302  gettimeofday (&curr_time, NULL);
6303 
6304  p += snprintf (p, MAX ((last - p), 0), "[%s][%s]", hb_strtime (time_str, sizeof (time_str), &curr_time), event_name);
6305 
6306  if (msg_fmt != NULL)
6307  {
6308  va_start (args, msg_fmt);
6309  vsnprintf (p, MAX ((last - p), 0), msg_fmt, args);
6310  va_end (args);
6311  }
6312 
6314  return;
6315 }
6316 
6317 /*
6318  * hb_check_ping -
6319  * return : HB_PING_SUCCESS, HB_PING_USELESS_HOST, HB_PING_SYS_ERR or HB_PING_FAILURE
6320  *
6321  */
6322 static int
6323 hb_check_ping (const char *host)
6324 {
6325 #define PING_COMMAND_FORMAT \
6326 "ping -w 1 -c 1 %s >/dev/null 2>&1; " \
6327 "echo $?"
6328 
6329  char ping_command[256], result_str[16];
6330  char buf[128];
6331  char *end_p;
6332  int result = 0;
6333  int ping_result;
6334  FILE *fp;
6335  HB_NODE_ENTRY *node;
6336 
6337  /* If host is one of the cluster nodes, skip the ping check */
6338  for (node = hb_Cluster->nodes; node; node = node->next)
6339  {
6340  if (are_hostnames_equal (host, node->host_name))
6341  {
6342  /* PING Host is same as cluster's host name */
6343  snprintf (buf, sizeof (buf), "Useless PING host name %s", host);
6345  return HB_PING_USELESS_HOST;
6346  }
6347  }
6348 
6349  snprintf (ping_command, sizeof (ping_command), PING_COMMAND_FORMAT, host);
6350  fp = popen (ping_command, "r");
6351  if (fp == NULL)
6352  {
6353  /* ping open fail */
6354  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_HB_NODE_EVENT, 1, "PING command pork failed");
6355  return HB_PING_SYS_ERR;
6356  }
6357 
6358  if (fgets (result_str, sizeof (result_str), fp) == NULL)
6359  {
6360  pclose (fp);
6361  MASTER_ER_SET (ER_ERROR_SEVERITY, ARG_FILE_LINE, ER_HB_NODE_EVENT, 1, "Can't get PING result");
6362  return HB_PING_SYS_ERR;
6363  }
6364 
6365  result_str[sizeof (result_str) - 1] = 0;
6366 
6367  pclose (fp);
6368 
6369  result = str_to_int32 (&ping_result, &end_p, result_str, 10);
6370  if (result != 0 || ping_result != NO_ERROR)
6371  {
6372  /* ping failed */
6373  snprintf (buf, sizeof (buf), "PING failed for host %s", host);
6375 
6376  return HB_PING_FAILURE;
6377  }
6378 
6379  return HB_PING_SUCCESS;
6380 }
6381 
6382 static int
6383 hb_help_sprint_ping_host_info (char *buffer, int max_length)
6384 {
6385  HB_PING_HOST_ENTRY *ping_host;
6386  char *p, *last;
6387 
6388  if (*buffer != '\0')
6389  {
6390  memset (buffer, 0, max_length);
6391  }
6392 
6393  p = buffer;
6394  last = buffer + max_length;
6395 
6396  p += snprintf (p, MAX ((last - p), 0), "HA Ping Host Info\n");
6397  p +=
6398  snprintf (p, MAX ((last - p), 0),
6399  "==============================" "==================================================\n");
6400 
6401  p +=
6402  snprintf (p, MAX ((last - p), 0), " * PING check is %s\n",
6403  hb_Cluster->is_ping_check_enabled ? "enabled" : "disabled");
6404  p +=
6405  snprintf (p, MAX ((last - p), 0),
6406  "------------------------------" "--------------------------------------------------\n");
6407  p += snprintf (p, MAX ((last - p), 0), "%-20s %-20s\n", "hostname", "PING check result");
6408  p +=
6409  snprintf (p, MAX ((last - p), 0),
6410  "------------------------------" "--------------------------------------------------\n");
6411  for (ping_host = hb_Cluster->ping_hosts; ping_host; ping_host = ping_host->next)
6412  {
6413  p +=
6414  snprintf (p, MAX ((last - p), 0), "%-20s %-20s\n", ping_host->host_name,
6415  hb_ping_result_string (ping_host->ping_result));
6416  }
6417  p +=
6418  snprintf (p, MAX ((last - p), 0),
6419  "==============================" "==================================================\n");
6420 
6421  return p - buffer;
6422 }
6423 
6424 static int
6425 hb_help_sprint_nodes_info (char *buffer, int max_length)
6426 {
6427  HB_NODE_ENTRY *node;
6428  char *p, *last;
6429 
6430  if (*buffer != '\0')
6431  {
6432  memset (buffer, 0, max_length);
6433  }
6434 
6435  p = buffer;
6436  last = buffer + max_length;
6437 
6438  p += snprintf (p, MAX ((last - p), 0), "HA Node Info\n");
6439  p +=
6440  snprintf (p, MAX ((last - p), 0),
6441  "==============================" "==================================================\n");
6442  p +=
6443  snprintf (p, MAX ((last - p), 0), " * group_id : %s host_name : %s state : %s \n", hb_Cluster->group_id,
6444  hb_Cluster->host_name, hb_node_state_string (hb_Cluster->state));
6445  p +=
6446  snprintf (p, MAX ((last - p), 0),
6447  "------------------------------" "--------------------------------------------------\n");
6448  p +=
6449  snprintf (p, MAX ((last - p), 0), "%-20s %-10s %-15s %-10s %-20s\n", "name", "priority", "state", "score",
6450  "missed heartbeat");
6451  p +=
6452  snprintf (p, MAX ((last - p), 0),
6453  "------------------------------" "--------------------------------------------------\n");
6454 
6455  for (node = hb_Cluster->nodes; node; node = node->next)
6456  {
6457  p +=
6458  snprintf (p, MAX ((last - p), 0), "%-20s %-10u %-15s %-10d %-20d\n", node->host_name, node->priority,
6459  hb_node_state_string (node->state), node->score, node->heartbeat_gap);
6460  }
6461 
6462  p +=
6463  snprintf (p, MAX ((last - p), 0),
6464  "==============================" "==================================================\n");
6465  p += snprintf (p, MAX ((last - p), 0), "\n");
6466 
6467  return p - buffer;
6468 }
6469 
6470 static int
6471 hb_help_sprint_processes_info (char *buffer, int max_length)
6472 {
6473  HB_PROC_ENTRY *proc;
6474  char *p, *last;
6475 
6476  if (*buffer != '\0')
6477  {
6478  memset (buffer, 0, max_length);
6479  }
6480 
6481  p = buffer;
6482  last = p + max_length;
6483 
6484  p += snprintf (p, MAX ((last - p), 0), "HA Process Info\n");
6485 
6486  p +=
6487  snprintf (p, MAX ((last - p), 0),
6488  "==============================" "==================================================\n");
6489  p += snprintf (p, MAX ((last - p), 0), " * state : %s \n", hb_node_state_string (hb_Cluster->state));
6490  p +=
6491  snprintf (p, MAX ((last - p), 0),
6492  "------------------------------" "--------------------------------------------------\n");
6493  p += snprintf (p, MAX ((last - p), 0), "%-10s %-22s %-15s %-10s\n", "pid", "state", "type", "socket fd");
6494  p += snprintf (p, MAX ((last - p), 0), " %-30s %-35s\n", "exec-path", "args");
6495  p +=
6496  snprintf (p, MAX ((last - p), 0),
6497  "------------------------------" "--------------------------------------------------\n");
6498 
6499  for (proc = hb_Resource->procs; proc; proc = proc->next)
6500  {
6501  if (proc->state == HB_PSTATE_UNKNOWN)
6502  {
6503  continue;
6504  }
6505 
6506  p +=
6507  snprintf (p, MAX ((last - p), 0), "%-10d %-22s %-15s %-10d\n", proc->pid,
6508  hb_process_state_string (proc->type, proc->state), hb_process_type_string (proc->type), proc->sfd);
6509  p += snprintf (p, MAX ((last - p), 0), " %-30s %-35s\n", proc->exec_path, proc->args);
6510  }
6511 
6512  p +=
6513  snprintf (p, MAX ((last - p), 0),
6514  "==============================" "==================================================\n");
6515  p += snprintf (p, MAX ((last - p), 0), "\n");
6516 
6517  return p - buffer;
6518 }
6519 
6520 static int
6521 hb_help_sprint_jobs_info (HB_JOB * jobs, char *buffer, int max_length)
6522 {
6523  int rv;
6524  HB_JOB_ENTRY *job;
6525  char *p, *last;
6526 
6527  p = (char *) &buffer[0];
6528  last = p + sizeof (buffer);
6529 
6530  p += snprintf (p, MAX ((last - p), 0), "HA Job Info\n");
6531  p +=
6532  snprintf (p, MAX ((last - p), 0),
6533  "==============================" "==================================================\n");
6534  p += snprintf (p, MAX ((last - p), 0), "%-10s %-20s %-20s %-20s\n", "type", "func", "arg", "expire");
6535  p +=
6536  snprintf (p, MAX ((last - p), 0),
6537  "------------------------------" "--------------------------------------------------\n");
6538 
6539  rv = pthread_mutex_lock (&jobs->lock);
6540  for (job = jobs->jobs; job; job = job->next)
6541  {
6542  p +=
6543  snprintf (p, MAX ((last - p), 0), "%-10d %-20p %-20p %-10d.%-10d\n", job->type, (void *) job->func,
6544  (void *) job->arg, (unsigned int) job->expire.tv_sec, (unsigned int) job->expire.tv_usec);
6545  }
6546 
6547  pthread_mutex_unlock (&jobs->lock);
6548 
6549  p +=
6550  snprintf (p, MAX ((last - p), 0),
6551  "==============================" "==================================================\n");
6552  p += snprintf (p, MAX ((last - p), 0), "\n");
6553 
6554  return p - buffer;
6555 }
6556 
6557 int
6559 {
6560  int rv, error, result;
6561  struct sockaddr_in req_addr;
6562  struct in_addr node_addr;
6563  socklen_t req_addr_len;
6564  HB_NODE_ENTRY *node;
6565 
6566  req_addr_len = sizeof (req_addr);
6567 
6568  if (getpeername (sd, (struct sockaddr *) &req_addr, &req_addr_len) < 0)
6569  {
6570  return HB_HC_FAILED;
6571  }
6572 
6573  /* from localhost */
6574  if (req_addr.sin_family == AF_UNIX)
6575  {
6576  return HB_HC_ELIGIBLE_LOCAL;
6577  }
6578 
6579  rv = pthread_mutex_lock (&hb_Cluster->lock);
6580 
6581  result = HB_HC_UNAUTHORIZED;
6582  for (node = hb_Cluster->nodes; node; node = node->next)
6583  {
6584  error = hb_hostname_to_sin_addr (node->host_name, &node_addr);
6585  if (error != NO_ERROR)
6586  {
6587  MASTER_ER_LOG_DEBUG (ARG_FILE_LINE, "Failed to resolve IP address of %s", node->host_name);
6588  result = HB_HC_FAILED;
6589  continue;
6590  }
6591 
6592  if (memcmp (&req_addr.sin_addr, &node_addr, sizeof (struct in_addr)) == 0)
6593  {
6594  pthread_mutex_unlock (&hb_Cluster->lock);
6595  return HB_HC_ELIGIBLE_REMOTE;
6596  }
6597  }
6598  pthread_mutex_unlock (&hb_Cluster->lock);
6599 
6600  return result;
6601 }
6602 
6603 /*
6604  * hb_start_deactivate_server_info -
6605  * Initialize hb_Deactivate_info,
6606  * and set info_started flag to true.
6607  *
6608  * return: none
6609  */
6610 void
6612 {
6613  assert (hb_Deactivate_info.info_started == false);
6614 
6615  if (hb_Deactivate_info.server_pid_list != NULL)
6616  {
6617  free_and_init (hb_Deactivate_info.server_pid_list);
6618  }
6619 
6620  hb_Deactivate_info.server_count = 0;
6621  hb_Deactivate_info.info_started = true;
6622 }
6623 
6624 bool
6626 {
6627  return hb_Deactivate_info.info_started;
6628 }
6629 
6630 bool
6632 {
6633  HB_PROC_ENTRY *proc;
6634 
6635  pthread_mutex_lock (&hb_Resource->lock);
6636  for (proc = hb_Resource->procs; proc; proc = proc->next)
6637  {
6638  if (proc->conn != NULL)
6639  {
6640  pthread_mutex_unlock (&hb_Resource->lock);
6641  return false;
6642  }
6643  assert (proc->sfd == INVALID_SOCKET);
6644  }
6645  pthread_mutex_unlock (&hb_Resource->lock);
6646 
6647  return true;
6648 }
6649 
6650 
6651 /*
6652  * hb_get_deactivating_server_count -
6653  *
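6654  * return: number of pids in hb_Deactivate_info.server_pid_list not yet found terminated (0 if deactivation has not started)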
6655  */
6656 int
6658 {
6659  int i, num_active_server = 0;
6660 
6661  if (hb_Deactivate_info.info_started == true)
6662  {
6663  for (i = 0; i < hb_Deactivate_info.server_count; i++)
6664  {
6665  if (hb_Deactivate_info.server_pid_list[i] > 0)
6666  {
6667  if (kill (hb_Deactivate_info.server_pid_list[i], 0) && errno == ESRCH)
6668  {
6669  /* server was terminated */
6670  hb_Deactivate_info.server_pid_list[i] = 0;
6671  }
6672  else
6673  {
6674  num_active_server++;
6675  }
6676  }
6677  }
6678 
6679  return num_active_server;
6680  }
6681 
6682  return 0;
6683 }
6684 
6685 /*
6686  * hb_finish_deactivate_server_info -
6687  * clear hb_Deactivate_info,
6688  * and set info_started flag to false.
6689  *
6690  * return: none
6691  */
6692 void
6694 {
6695  if (hb_Deactivate_info.server_pid_list != NULL)
6696  {
6697  free_and_init (hb_Deactivate_info.server_pid_list);
6698  }
6699 
6700  hb_Deactivate_info.server_count = 0;
6701  hb_Deactivate_info.info_started = false;
6702 }
6703 
6704 /*
6705  * hb_return_proc_state_by_fd() -
6706  * return: process state
6707  *
6708  * sfd(in):
6709  */
6710 int
6712 {
6713  int rv;
6714  int state = 0;
6715  HB_PROC_ENTRY *proc;
6716 
6717  if (hb_Resource == NULL)
6718  {
6719  return HB_PSTATE_UNKNOWN;
6720  }
6721 
6722  rv = pthread_mutex_lock (&hb_Resource->lock);
6723  proc = hb_return_proc_by_fd (sfd);
6724  if (proc == NULL)
6725  {
6726  pthread_mutex_unlock (&hb_Resource->lock);
6727  return HB_PSTATE_DEAD;
6728  }
6729 
6730  state = (int) proc->state;
6731 
6732  if (proc->server_hang)
6733  {
6734  state = HB_PSTATE_DEAD;
6735  }
6736  pthread_mutex_unlock (&hb_Resource->lock);
6737 
6738  return state;
6739 }
6740 
6741 /*
6742  * hb_is_hang_process() -
6743  * return:
6744  *
6745  * sfd(in):
6746  */
6747 bool
6749 {
6750  int rv;
6751  HB_PROC_ENTRY *proc;
6752 
6753  if (hb_Resource == NULL)
6754  {
6755  return false;
6756  }
6757 
6758  rv = pthread_mutex_lock (&hb_Resource->lock);
6759  proc = hb_return_proc_by_fd (sfd);
6760  if (proc == NULL)
6761  {
6762  pthread_mutex_unlock (&hb_Resource->lock);
6763  return false;
6764  }
6765 
6766  if (proc->server_hang)
6767  {
6768  pthread_mutex_unlock (&hb_Resource->lock);
6769  return true;
6770  }
6771  pthread_mutex_unlock (&hb_Resource->lock);
6772 
6773  return false;
6774 }
static void hb_resource_job_change_mode(HB_JOB_ARG *arg)
static void hb_cluster_job_failover(HB_JOB_ARG *arg)
#define HB_PSTATE_REGISTERED_AND_STANDBY_STR
#define SLEEP_MILISEC(sec, msec)
Definition: util_func.h:40
static void hb_resource_send_get_eof(void)
HB_LIST ** prev
struct hbp_proc_register HBP_PROC_REGISTER
Definition: heartbeat.h:137
static void hb_resource_demote_kill_server_proc(void)
void hb_finish_deactivate_server_info(void)
HB_NODE_STATE_TYPE state
#define HB_PSTATE_REGISTERED_AND_ACTIVE_STR
static int hb_check_ping(const char *host)
unsigned int seq
Definition: heartbeat.h:125
#define HB_NSTATE_STR_SZ
Definition: heartbeat.h:104
#define HB_TEMP_CHECK_VALID_PING_SERVER_INTERVAL_IN_MSECS
Definition: heartbeat.h:39
#define HB_PSTATE_NOT_REGISTERED_STR
static void * hb_thread_check_disk_failure(void *arg)
static void hb_resource_job_cleanup_all(HB_JOB_ARG *arg)
#define HB_VALID_NO_ERROR_STR
int hb_master_init(void)
#define NO_ERROR
Definition: error_code.h:46
struct timeval last_recv_hbtime
#define EXIT_FUNC()
#define HA_NODE_SCORE_FORMAT_STRING
#define HA_ADMIN_INFO_FORMAT_STRING
unsigned short changemode_gap
int hb_start_util_process(char *args)
#define HB_PING_SUCCESS_STR
static int hb_help_sprint_ping_host_info(char *buffer, int max_length)
#define ER_CSS_PTHREAD_ATTR_SETDETACHSTATE
Definition: error_code.h:992
char * name
Definition: master_util.h:84
#define HB_MAX_PING_CHECK
HB_JOB_ENTRY ** prev
HB_NODE_STATE_TYPE state
static int hb_help_sprint_jobs_info(HB_JOB *jobs, char *buffer, int max_length)
#define HB_MAX_CHANGEMODE_DIFF_TO_KILL
HB_CLUSTER_JOB_ARG cluster_job_arg
int SOCKET
Definition: porting.h:482
#define PING_COMMAND_FORMAT
struct timeval rtime
static void hb_cluster_job_heartbeat(HB_JOB_ARG *arg)
static void hb_list_move(HB_LIST **dest_pp, HB_LIST **source_pp)
static char hb_Nolog_event_msg[LINE_MAX]
void hb_deregister_by_args(char *args)
static bool hb_cluster_is_received_heartbeat_from_all(void)
static HB_JOB_ENTRY * hb_job_dequeue(HB_JOB *jobs)
unsigned char state
void hb_get_admin_info_string(char **str)
#define HA_PROCESS_REGISTER_TIME_FORMAT_STRING
#define pthread_mutex_init(a, b)
Definition: area_alloc.c:48
void LSA_COPY(log_lsa *plsa1, const log_lsa *plsa2)
Definition: log_lsa.hpp:139
HB_NODE_ENTRY * master
static HB_JOB_FUNC hb_resource_jobs[]
char host_name[CUB_MAXHOSTNAMELEN]
bool is_curr_eof_received
static void hb_remove_ping_host(HB_PING_HOST_ENTRY *entry_p)
static int hb_resource_send_changemode(HB_PROC_ENTRY *proc)
unsigned int htonl(unsigned int from)
void hb_reconfig_heartbeat(char **str)
HB_PING_HOST_ENTRY ** prev
#define ER_FAILED
Definition: error_code.h:47
int sfd
int hb_activate_heartbeat(void)
void hb_kill_all_heartbeat_process(char **str)
HB_JOB_ARG * arg
#define pthread_mutex_unlock(a)
Definition: area_alloc.c:51
#define HB_NODE_SCORE_MASTER
#define HB_PSTATE_UNKNOWN_STR
static const char * hb_strtime(char *s, unsigned int max, struct timeval *tv_p)
void(* HB_JOB_FUNC)(HB_JOB_ARG *)
void css_remove_entry_by_conn(CSS_CONN_ENTRY *conn_p, SOCKET_QUEUE_ENTRY **anchor_p)
Definition: master.c:1287
struct timeval ftime
#define HB_NODE_SCORE_UNKNOWN
bool are_hostnames_equal(const char *hostname_a, const char *hostname_b)
Definition: util_common.c:449
static int hb_cluster_calc_score(void)
char group_id[HB_MAX_GROUP_ID_LEN]
Definition: heartbeat.h:126
static void hb_remove_all_procs(HB_PROC_ENTRY *first)
#define assert_release(e)
Definition: error_manager.h:96
HB_RESOURCE_JOB_ARG resource_job_arg
static bool hb_resource_check_server_log_grow(void)
static int hb_cluster_load_ping_host_list(char *ha_ping_host_list)
#define ER_CSS_PTHREAD_CREATE
Definition: error_code.h:995
void hb_register_new_process(CSS_CONN_ENTRY *conn)
SOCKET fd
#define HB_IPV4_STR_LEN
static int hb_hostname_to_sin_addr(const char *host, struct in_addr *addr)
static void hb_resource_job_proc_start(HB_JOB_ARG *arg)
pthread_mutex_t css_Master_socket_anchor_lock
Definition: master.c:127
#define OR_ALIGNED_BUF(size)
#define ER_HB_NODE_EVENT
Definition: error_code.h:1238
static const char * hb_node_state_string(int nstate)
struct sockaddr_in saddr
#define HB_VALID_CANNOT_RESOLVE_HOST_STR
#define ER_PRM_BAD_VALUE
Definition: error_code.h:1048
#define MASTER_TO_SRV_MSG_SIZE
#define HA_PROCESS_DEREGISTER_TIME_FORMAT_STRING
#define HB_PSTATE_REGISTERED_STR
HB_NODE_STATE_TYPE state
#define LSA_AS_ARGS(lsa_ptr)
Definition: log_lsa.hpp:78
void css_process_start_shutdown(SOCKET_QUEUE_ENTRY *sock_entq, int timeout, char *buffer)
char host_name[CUB_MAXHOSTNAMELEN]
#define OR_ALIGNED_BUF_SIZE(abuf)
pthread_mutex_t lock
static int hb_resource_job_initialize(void)
HB_NODE_ENTRY * next
HB_JOB_FUNC func
char args[HB_MAX_SZ_PROC_ARGS]
#define PTR_ALIGN(addr, boundary)
Definition: memory_alloc.h:77
#define HA_ADMIN_INFO_NOLOG_FORMAT_STRING
int css_receive_heartbeat_data(CSS_CONN_ENTRY *conn, char *data, int size)
Definition: heartbeat.c:230
HB_JOB * cluster_Jobs
char * or_unpack_log_lsa(char *ptr, struct log_lsa *lsa)
#define INVALID_SOCKET
Definition: porting.h:483
unsigned short len
Definition: heartbeat.h:124
#define HB_PROC_RECOVERY_DELAY_TIME
#define HB_PSTATE_REGISTERED_AND_TO_BE_STANDBY_STR
#define ERR_CSS_TCP_HOST_NAME_ERROR
Definition: error_code.h:426
#define HA_COPYLOG_PROCESS_FORMAT_STRING
static void * hb_thread_resource_worker(void *arg)
static void hb_resource_cleanup(void)
#define HB_PING_FAILURE_STR
#define MAX_ALIGNMENT
Definition: memory_alloc.h:70
#define ER_HB_STARTED
Definition: error_code.h:1236
#define HB_DISK_FAILURE_CHECK_TIMER_IN_MSECS
Definition: heartbeat.h:54
static void hb_remove_ui_node(HB_UI_NODE_ENTRY *node)
void hb_enable_er_log(void)
static const char * hb_valid_result_string(int v_result)
static pthread_mutex_t gethostbyname_lock
Definition: tcp.c:82
static void hb_resource_job_demote_confirm_shutdown(HB_JOB_ARG *arg)
static int hb_cluster_send_heartbeat_resp(struct sockaddr_in *saddr, socklen_t saddr_len, char *dest_host_name)
bool being_shutdown
#define HB_RESULT_FAILURE_STR
int hb_return_proc_state_by_fd(int sfd)
static void hb_cluster_remove_all_ui_nodes(HB_UI_NODE_ENTRY *first)
#define INADDR_NONE
Definition: tcp.c:89
#define OR_ALIGNED_BUF_START(abuf)
HB_NODE_ENTRY ** prev
#define MASTER_ER_SET_WITH_OSERROR(...)
Definition: master_util.h:56
#define HB_UI_NODE_CACHE_TIME_IN_MSECS
#define HA_GET_MODE()
static char * node_name
Definition: cas_runner.c:174
static void hb_cluster_job_init(HB_JOB_ARG *arg)
int css_send_heartbeat_data(CSS_CONN_ENTRY *conn, const char *data, int size)
Definition: heartbeat.c:178
char host_name[CUB_MAXHOSTNAMELEN]
UINT64 prm_get_bigint_value(PARAM_ID prm_id)
#define HB_PING_STR_SIZE
static void * hb_thread_cluster_worker(void *arg)
LOG_LSA curr_eof
static void hb_list_remove(HB_LIST *n)
#define HB_PING_UNKNOWN_STR
#define HB_CMD_UTIL_START_STR
static HB_UI_NODE_ENTRY * hb_return_ui_node(char *host_name, char *group_id, struct sockaddr_in saddr)
#define HA_NODE_FORMAT_STRING
unsigned char type
char host_name[CUB_MAXHOSTNAMELEN]
static int hb_cluster_initialize(const char *nodes, const char *replicas)
static void hb_resource_demote_start_shutdown_server_proc(void)
#define assert(x)
static void hb_remove_proc(HB_PROC_ENTRY *entry_p)
static void hb_cluster_job_check_valid_ping_server(HB_JOB_ARG *arg)
static void hb_cluster_job_demote(HB_JOB_ARG *arg)
HB_NODE_ENTRY * myself
int prm_get_integer_value(PARAM_ID prm_id)
#define HB_VALID_UNIDENTIFIED_NODE_STR
void hb_resource_receive_changemode(CSS_CONN_ENTRY *conn)
struct timeval stime
void css_master_cleanup(int sig)
Definition: master.c:218
char group_id[HB_MAX_GROUP_ID_LEN]
static bool hb_cluster_is_isolated(void)
static int hb_cluster_job_set_expire_and_reorder(unsigned int job_type, unsigned int msec)
#define ER_OUT_OF_VIRTUAL_MEMORY
Definition: error_code.h:50
void hb_start_deactivate_server_info(void)
static void hb_resource_job_demote_start_shutdown(HB_JOB_ARG *arg)
#define HB_MAX_SZ_PROC_ARGS
Definition: heartbeat.h:83
static void hb_cluster_remove_all_nodes(HB_NODE_ENTRY *first)
HB_PROC_ENTRY ** prev
static int hb_thread_initialize(void)
static char executable_path[PATH_MAX]
Definition: server.c:73
static void hb_cluster_cleanup(void)
#define ER_CSS_PTHREAD_ATTR_SETSCOPE
Definition: error_code.h:993
const char * css_ha_server_state_string(HA_SERVER_STATE state)
void hb_resource_receive_get_eof(CSS_CONN_ENTRY *conn)
#define ER_HA_GENERIC_ERROR
Definition: error_code.h:1297
#define ER_HB_PROCESS_EVENT
Definition: error_code.h:1239
short score
bool hb_is_registered_process(CSS_CONN_ENTRY *conn, char *args)
#define ERR_CSS_CANNOT_FORK
Definition: error_code.h:417
#define ERR_CSS_TCP_DATAGRAM_BIND
Definition: error_code.h:435
struct timeval dtime
#define HA_PROCESS_START_TIME_FORMAT_STRING
const char * db_error_string(int level)
Definition: db_admin.c:2116
void hb_resource_shutdown_and_cleanup(void)
CSS_CONN_ENTRY * conn
static int hb_help_sprint_nodes_info(char *buffer, int max_length)
static HB_NODE_ENTRY * hb_return_node_by_name_except_me(char *name)
#define HB_PING_SYS_ERR_STR
#define HB_INFO_STR_MAX
char * envvar_bindir_file(char *path, size_t size, const char *filename)
int sysprm_reload_and_init(const char *db_name, const char *conf_file)
static void * hb_thread_cluster_reader(void *arg)
#define HA_PROCESS_EXEC_PATH_FORMAT_STRING
static int rv
Definition: area_alloc.c:52
#define HB_CMD_RELOAD_STR
#define HB_PSTATE_STR_SZ
void hb_get_process_info_string(char **str, bool verbose_yn)
#define HB_CMD_DEACTIVATE_STR
const char ** argvp
Definition: dynamic_load.c:952
bool hb_is_deactivation_ready(void)
static int hb_cluster_send_heartbeat_internal(struct sockaddr_in *saddr, socklen_t saddr_len, char *dest_host_name, bool is_req)
#define NULL
Definition: freelistheap.h:34
int str_to_int32(int *ret_p, char **end_p, const char *str_p, int base)
Definition: porting.c:2346
#define strncpy_bufsize(buf, str)
Definition: porting.h:340
static void hb_add_timeval(struct timeval *tv_p, unsigned int msec)
HB_JOB_FUNC * job_funcs
static void hb_cluster_request_heartbeat_to_all(void)
static HB_UI_NODE_ENTRY * hb_add_ui_node(char *host_name, char *group_id, struct sockaddr_in saddr, int state)
unsigned short num_jobs
struct timeval expire
pid_t pid
Definition: dynamic_load.c:955
bool hb_is_hang_process(int sfd)
HB_PROC_ENTRY * procs
char exec_path[HB_MAX_SZ_PROC_EXEC_PATH]
if(extra_options)
Definition: dynamic_load.c:958
#define HB_IS_INITIALIZED_TIME(arg_time)
#define ENTER_FUNC()
#define ER_HB_COMMAND_EXECUTION
Definition: error_code.h:1240
static HB_PROC_ENTRY * hb_return_proc_by_fd(int sfd)
unsigned short htons(unsigned short from)
HB_UI_NODE_ENTRY * next
bool LSA_ISNULL(const log_lsa *lsa_ptr)
Definition: log_lsa.hpp:153
char dest_host_name[CUB_MAXHOSTNAMELEN]
Definition: heartbeat.h:128
static HB_JOB_ENTRY * hb_cluster_job_dequeue(void)
static HB_NODE_ENTRY * hb_add_node_to_cluster(char *host_name, unsigned short priority)
struct timeval frtime
static void hb_cluster_job_calc_score(HB_JOB_ARG *arg)
char * or_unpack_int(char *ptr, int *number)
#define HB_MAX_WAIT_FOR_NEW_MASTER
static int hb_job_queue(HB_JOB *jobs, unsigned int job_type, HB_JOB_ARG *arg, unsigned int msec)
#define HA_PING_HOSTS_FORMAT_STRING
static void hb_resource_job_shutdown(void)
Definition: master_util.h:75
#define HB_MAX_SZ_PROC_EXEC_PATH
Definition: heartbeat.h:80
static void hb_list_add(HB_LIST **p, HB_LIST *n)
static void hb_resource_job_proc_dereg(HB_JOB_ARG *arg)
#define OR_LOG_LSA_ALIGNED_SIZE
static HB_PING_HOST_ENTRY * hb_add_ping_host(char *host_name)
#define HB_RESULT_SUCCESS_STR
HB_PROC_ENTRY * next
static void hb_cluster_job_check_ping(HB_JOB_ARG *arg)
int count(int &result, const cub_regex_object &reg, const std::string &src, const int position, const INTL_CODESET codeset)
static const char * hb_ping_result_string(int ping_result)
bool server_hang
static bool hb_resource_demote_confirm_shutdown_server_proc(void)
#define ER_CSS_PTHREAD_ATTR_INIT
Definition: error_code.h:990
HB_PING_HOST_ENTRY * next
HB_CLUSTER * hb_Cluster
bool is_ping_check_enabled
#define MASTER_ER_LOG_DEBUG(...)
Definition: master_util.h:65
#define max(a, b)
#define ER_BO_UNABLE_TO_FIND_HOSTNAME
Definition: error_code.h:180
static void hb_kill_process(pid_t *pids, int count)
#define HA_NODE_HEARTBEAT_GAP_FORMAT_STRING
#define HB_NODE_SCORE_SLAVE
int css_send_heartbeat_request(CSS_CONN_ENTRY *conn, int command)
Definition: heartbeat.c:151
void hb_disable_er_log(int reason, const char *msg_fmt,...)
#define HA_NODE_INFO_FORMAT_STRING
static HB_PROC_ENTRY * hb_return_proc_by_pid(int pid)
HB_RESOURCE * hb_Resource
static void error(const char *msg)
Definition: gencat.c:331
HB_LIST * next
#define MASTER_ER_SET(...)
Definition: master_util.h:47
char * or_pack_int(char *ptr, int number)
static void hb_cluster_job_shutdown(void)
#define HB_PSTATE_REGISTERED_AND_TO_BE_ACTIVE_STR
HB_NODE_ENTRY * nodes
pthread_mutex_t lock
#define HB_MAX_GROUP_ID_LEN
Definition: heartbeat.h:79
static HB_JOB_ENTRY * hb_resource_job_dequeue(void)
#define HB_PSTATE_DEAD_STR
#define ARG_FILE_LINE
Definition: error_manager.h:44
#define HB_MAX_NUM_PROC_ARGV
Definition: heartbeat.h:81
static const char * hb_process_state_string(unsigned char ptype, int pstate)
HB_JOB_ENTRY * next
static int hb_resource_job_queue(unsigned int job_type, HB_JOB_ARG *arg, unsigned int msec)
unsigned short ntohs(unsigned short from)
#define HA_PROCESS_SHUTDOWN_TIME_FORMAT_STRING
#define HB_BUFFER_SZ
unsigned char type
Definition: heartbeat.h:116
#define HB_JOB_TIMER_WAIT_100_MILLISECOND
Definition: heartbeat.h:53
bool hb_Deactivate_immediately
#define HB_JOB_TIMER_WAIT_A_SECOND
Definition: heartbeat.h:51
enum HB_NODE_STATE HB_NODE_STATE_TYPE
Definition: heartbeat.h:106
const char ** argv
Definition: dynamic_load.c:952
const char * hb_process_type_string(int ptype)
Definition: heartbeat.c:104
#define free_and_init(ptr)
Definition: memory_alloc.h:147
#define strlen(s1)
Definition: intl_support.c:43
static bool hb_Is_activated
#define HB_VALID_IP_ADDR_MISMATCH_STR
char * prm_get_string_value(PARAM_ID prm_id)
void LSA_SET_NULL(log_lsa *lsa_ptr)
Definition: log_lsa.hpp:146
static HB_JOB_FUNC hb_cluster_jobs[]
#define HB_PSTATE_DEREGISTERED_STR
#define HB_MAX_CHANGEMODE_DIFF_TO_TERM
#define HB_JOB_TIMER_IMMEDIATELY
Definition: heartbeat.h:50
static void hb_remove_node(HB_NODE_ENTRY *entry_p)
static void hb_resource_job_confirm_start(HB_JOB_ARG *arg)
char group_id[HB_MAX_GROUP_ID_LEN]
bool LSA_GT(const log_lsa *plsa1, const log_lsa *plsa2)
Definition: log_lsa.hpp:188
static int hb_cluster_job_initialize(void)
static HB_JOB_ARG * hb_deregister_process(HB_PROC_ENTRY *proc)
void hb_cluster_shutdown_and_cleanup(void)
static int hb_cluster_send_heartbeat_req(char *dest_host_name)
bool prm_get_bool_value(PARAM_ID prm_id)
int hb_get_deactivating_server_count(void)
int util_log_write_errstr(const char *format,...)
Definition: util_func.c:493
#define HB_PSTATE_STARTED_STR
static HB_PROC_ENTRY * hb_alloc_new_proc(void)
struct hbp_header HBP_HEADER
Definition: heartbeat.h:113
pthread_mutex_t lock
#define HB_PING_USELESS_HOST_STR
#define HA_UI_NODE_FORMAT_STRING
static void hb_resource_shutdown_all_ha_procs(void)
SOCKET_QUEUE_ENTRY * css_return_entry_by_conn(CSS_CONN_ENTRY *conn_p, SOCKET_QUEUE_ENTRY **anchor_p)
Definition: master.c:1424
int hb_deactivate_heartbeat(void)
static void hb_resource_job_confirm_cleanup_all(HB_JOB_ARG *arg)
static HB_PROC_ENTRY * hb_return_proc_by_args(char *args)
static int hb_hostname_n_port_to_sockaddr(const char *host, int port, struct sockaddr *saddr, socklen_t *slen)
#define HB_CMD_DEREGISTER_STR
int i
Definition: dynamic_load.c:954
unsigned int type
char * msgcat_message(int cat_id, int set_id, int msg_id)
int hb_check_request_eligibility(SOCKET sd)
static void hb_cluster_job_failback(HB_JOB_ARG *arg)
const char * prm_get_name(PARAM_ID prm_id)
int pid
static int hb_set_net_header(HBP_HEADER *header, unsigned char type, bool is_req, unsigned short len, unsigned int seq, char *dest_host_name)
void hb_deregister_by_pid(pid_t pid)
static HB_DEACTIVATE_INFO hb_Deactivate_info
struct timeval ktime
static void hb_cleanup_ui_nodes(HB_UI_NODE_ENTRY *first)
#define ER_CSS_PTHREAD_ATTR_SETSTACKSIZE
Definition: error_code.h:994
static int hb_cluster_job_queue(unsigned int job_type, HB_JOB_ARG *arg, unsigned int msec)
void hb_get_ping_host_info_string(char **str)
static int hb_resource_initialize(void)
static bool hb_cluster_check_valid_ping_server(void)
char * strdup(const char *str)
Definition: porting.c:901
static int hb_help_sprint_processes_info(char *buffer, int max_length)
#define HA_DISABLED()
short heartbeat_gap
#define HB_VALID_GROUP_NAME_MISMATCH_STR
#define HB_JOB_TIMER_WAIT_500_MILLISECOND
Definition: heartbeat.h:52
#define pthread_mutex_lock(a)
Definition: area_alloc.c:50
#define HB_NODE_SCORE_TO_BE_MASTER
#define HA_PROCESS_ARGV_FORMAT_STRING
int hb_prepare_deactivate_heartbeat(void)
unsigned int ping_check_count
HB_UI_NODE_ENTRY * ui_nodes
unsigned int ntohl(unsigned int from)
#define ER_CSS_PTHREAD_ATTR_DESTROY
Definition: error_code.h:991
enum ha_server_state HA_SERVER_STATE
Definition: boot.h:126
HB_UI_NODE_ENTRY ** prev
#define HB_UI_NODE_CLEANUP_TIME_IN_MSECS
#define HB_REPLICA_PRIORITY
#define HB_GET_ELAPSED_TIME(end_time, start_time)
static void hb_cluster_receive_heartbeat(char *buffer, int len, struct sockaddr_in *from, socklen_t from_len)
static void hb_job_shutdown(HB_JOB *jobs)
pthread_mutex_t css_Master_er_log_enable_lock
Definition: master.c:130
HB_JOB_ENTRY * jobs
static int hb_compare_timeval(struct timeval *arg1, struct timeval *arg2)
#define HA_PING_HOSTS_INFO_FORMAT_STRING
bool css_Master_er_log_enabled
Definition: master.c:131
#define CUB_MAXHOSTNAMELEN
Definition: porting.h:379
static void hb_resource_job_confirm_dereg(HB_JOB_ARG *arg)
#define HB_MAX_SZ_PROC_ARGV
Definition: heartbeat.h:82
#define HA_SERVER_PROCESS_FORMAT_STRING
static int hb_cluster_load_group_and_node_list(char *ha_node_list, char *ha_replica_list)
bool hb_is_deactivation_started(void)
static char * host
static void hb_job_set_expire_and_reorder(HB_JOB *jobs, unsigned int job_type, unsigned int msec)
LOG_LSA prev_eof
char exec_path[HB_MAX_SZ_PROC_EXEC_PATH]
Definition: heartbeat.h:142
#define HA_APPLYLOG_PROCESS_FORMAT_STRING
#define HB_CMD_ACTIVATE_STR
char args[HB_MAX_SZ_PROC_ARGS]
struct timeval last_recv_time
HB_PING_HOST_ENTRY * ping_hosts
static HB_NODE_ENTRY * hb_return_node_by_name(char *name)
static void hb_cluster_remove_all_ping_hosts(HB_PING_HOST_ENTRY *first)
#define HA_PROCESS_INFO_FORMAT_STRING
static void hb_proc_make_arg(char **arg, char *args)
#define GETHOSTNAME(p, l)
Definition: porting.h:381
const char ** p
Definition: dynamic_load.c:945
HB_JOB * resource_Jobs
int v_result
int ping_result
SOCKET_QUEUE_ENTRY * css_Master_socket_anchor
Definition: master.c:125
unsigned int max_retries
static int hb_reload_config(void)
char orig_host_name[CUB_MAXHOSTNAMELEN]
Definition: heartbeat.h:127
#define ERR_CSS_TCP_DATAGRAM_SOCKET
Definition: error_code.h:438
void hb_get_node_info_string(char **str, bool verbose_yn)
static int hb_is_heartbeat_valid(char *host_name, char *group_id, struct sockaddr_in *from)
#define HA_ADMIN_INFO_NOLOG_EVENT_FORMAT_STRING
unsigned short priority
#define HB_DEFAULT_CHECK_VALID_PING_SERVER_INTERVAL_IN_MSECS
Definition: heartbeat.h:38
void hb_cleanup_conn_and_start_process(CSS_CONN_ENTRY *conn, SOCKET sfd)
char args[HB_MAX_SZ_PROC_ARGS]
Definition: heartbeat.h:143
static int hb_resource_job_set_expire_and_reorder(unsigned int job_type, unsigned int msec)
#define MSGCAT_CATALOG_UTILS