跟我一起看zabbix源码之zabbix alerter.c报警逻辑

前言:

晚上睡不着，想起白天和朋友聊监控系统，他以前在阿里和高德都做监控，算是个专业的监控开发选手了，对于监控那哥们确实理解得很深刻。过段时间我要去做嘉宾，讲点监控的东西，为了防止吹牛逼时被人问住，So 随手下载了zabbix的代码，打开alerter.c文件来看。话说zabbix的c代码写得比较清晰。其实还有一个原因就是，当时在乐视网做有关zabbix二次开发项目的时候，对于zabbix发出去的报警信息，我和大黄本来打算不调用第三方的脚本，直接用c来做http请求。后来因为各忙各的其他项目，这个想法也就废掉了。

更多有关zabbix开发的文章，请关注 blog.xiaorui.cc

首先需要介绍的是 execute_action 函数，这个函数负责处理与action相关联的mediatype，以及向db反馈成功还是失败的状态。

那我们根据当初配置的media，做不同的动作。比如是MEDIA_TYPE_EMAIL的话，那就连接SMTP服务器，发送邮件。如果是MEDIA_TYPE_EXEC的话，那就是fork进程，也就是调用第三方的脚本。

	if (MEDIA_TYPE_EMAIL == mediatype->type)
	{
		alarm(ALARM_ACTION_TIMEOUT);
		res = send_email(mediatype->smtp_server, mediatype->smtp_helo, mediatype->smtp_email,
				alert->sendto, alert->subject, alert->message, error, max_error_len);
		alarm(0);
	}
#ifdef HAVE_JABBER
	else if (MEDIA_TYPE_JABBER == mediatype->type)
	{
		/* Jabber uses its own timeouts */
		res = send_jabber(mediatype->username, mediatype->passwd,
				alert->sendto, alert->subject, alert->message, error, max_error_len);
	}
#endif
	else if (MEDIA_TYPE_SMS == mediatype->type)
	{
		/* SMS uses its own timeouts */
		res = send_sms(mediatype->gsm_modem, alert->sendto, alert->message, error, max_error_len);
	}
	else if (MEDIA_TYPE_EZ_TEXTING == mediatype->type)
	{
		/* Ez Texting uses its own timeouts */
		res = send_ez_texting(mediatype->username, mediatype->passwd,
				alert->sendto, alert->message, mediatype->exec_path, error, max_error_len);
	}
	else if (MEDIA_TYPE_EXEC == mediatype->type)

这下面是zabbix里面具体调用scripts脚本的过程。

		if (0 == access(cmd, X_OK))
		{
			send_to = zbx_dyn_escape_string(alert->sendto, "\"\\");
			subject = zbx_dyn_escape_string(alert->subject, "\"\\");
			message = zbx_dyn_escape_string(alert->message, "\"\\");

			zbx_snprintf_alloc(&cmd, &cmd_alloc, &cmd_offset, " \"%s\" \"%s\" \"%s\"",
					send_to, subject, message);

			zbx_free(send_to);
			zbx_free(subject);
			zbx_free(message);

			if (SUCCEED == (res = zbx_execute(cmd, &output, error, max_error_len, ALARM_ACTION_TIMEOUT)))
			{
				zabbix_log(LOG_LEVEL_DEBUG, "%s output:\n%s", mediatype->exec_path, output);
				zbx_free(output);
			}
			else
				res = FAIL;
		}
		else
			zbx_snprintf(error, max_error_len, "%s: %s", cmd, zbx_strerror(errno));

		zbx_free(cmd);

上面的是关于触发action的相关函数，那肯定还有个一直在调用execute_action函数的主循环函数吧。它的函数名字是 main_alerter_loop 。既然是loop，那就知道它是做啥的了，逻辑很简单：zabbix_server 启动后，fork出一个子进程来运行main_alerter_loop函数，让它独立负责报警这件事情。

关于zabbix日志记录逻辑:

	zabbix_log(LOG_LEVEL_INFORMATION, "%s #%d started [%s #%d]", get_daemon_type_string(daemon_type),
			server_num, get_process_type_string(process_type), process_num);

创建一个DB连接的对象

DBconnect(ZBX_DB_CONNECT_NORMAL);

通过Mysql查询alerts未发送的任务，通过media查到行为的方式。

		result = DBselect(
				"select a.alertid,a.mediatypeid,a.sendto,a.subject,a.message,a.status,mt.mediatypeid,"
				"mt.type,mt.description,mt.smtp_server,mt.smtp_helo,mt.smtp_email,mt.exec_path,"
				"mt.gsm_modem,mt.username,mt.passwd,a.retries"
				" from alerts a,media_type mt"
				" where a.mediatypeid=mt.mediatypeid"
					" and a.status=%d"
					" and a.alerttype=%d"
				" order by a.alertid",
				ALERT_STATUS_NOT_SENT,
				ALERT_TYPE_MESSAGE);

他是一次性的把没有发送，也就是未执行的报警任务，都给取出来，然后传递给execute_action去处理报警的逻辑。

			ZBX_STR2UINT64(alert.alertid, row[0]);
			ZBX_STR2UINT64(alert.mediatypeid, row[1]);
			alert.sendto = row[2];
			alert.subject = row[3];
			alert.message = row[4];
			alert.status = atoi(row[5]);

			ZBX_STR2UINT64(mediatype.mediatypeid, row[6]);
			mediatype.type = atoi(row[7]);
			mediatype.description = row[8];
			mediatype.smtp_server = row[9];
			mediatype.smtp_helo = row[10];
			mediatype.smtp_email = row[11];
			mediatype.exec_path = row[12];
			mediatype.gsm_modem = row[13];
			mediatype.username = row[14];
			mediatype.passwd = row[15];

			alert.retries = atoi(row[16]);

			*error = '\0';
			res = execute_action(&alert, &mediatype, error, sizeof(error));

虽然有不同的触发动作，但是返回值的状态都一样。下面的逻辑，是判断返回状态，然后把结果更新入库，并写入debug日志。

			if (SUCCEED == res)
			{
				zabbix_log(LOG_LEVEL_DEBUG, "alert ID [" ZBX_FS_UI64 "] was sent successfully",
						alert.alertid);
				DBexecute("update alerts set status=%d,error='' where alertid=" ZBX_FS_UI64,
						ALERT_STATUS_SENT, alert.alertid);
				alerts_success++;
			}
			else
			{
				zabbix_log(LOG_LEVEL_DEBUG, "error sending alert ID [" ZBX_FS_UI64 "]", alert.alertid);

				error_esc = DBdyn_escape_string_len(error, ALERT_ERROR_LEN);

				alert.retries++;

				if (ALERT_MAX_RETRIES > alert.retries)
				{
					DBexecute("update alerts set retries=%d,error='%s' where alertid=" ZBX_FS_UI64,
							alert.retries, error_esc, alert.alertid);
				}
				else
				{
					DBexecute("update alerts set status=%d,retries=%d,error='%s' where alertid=" ZBX_FS_UI64,
							ALERT_STATUS_FAILED, alert.retries, error_esc, alert.alertid);
				}

				zbx_free(error_esc);

				alerts_fail++;
			}

		}

最后的几段话意思是，统计时间及sleep 30秒后，再继续下一轮。

		sec = zbx_time() - sec;

		zbx_setproctitle("%s [sent alerts: %d success, %d fail in " ZBX_FS_DBL " sec, idle %d sec]",
				get_process_type_string(process_type), alerts_success, alerts_fail, sec,
				CONFIG_SENDER_FREQUENCY);

		zbx_sleep_loop(CONFIG_SENDER_FREQUENCY);

通过server.c确定zabbix 每次alert间隔的时间了。

[xiaorui@devops zabbix-2.4.2 ]$ grep 'CONFIG_SENDER_FREQUENCY' src/zabbix_server/server.c
int CONFIG_SENDER_FREQUENCY = 30;
[xiaorui@devops zabbix-2.4.2 ]$

我先前一直好奇，他报警的时候，是不是串行的，先前也看了官方的介绍说是串行执行的。奇怪了看他的函数定义的地方，发现有多线程的逻辑，不知道为啥没有应用上，而是用while一直遍历数据库返回的列表数据。

有时间把zabbix的有关alert的代码做个patch，真的很想知道，把zabbix做成http报警后，在报警多的时候，会不会能力很突出。希望我的这篇文章，能对那些做zabbix二次开发的有所帮助。

大家觉得文章对你有些作用！如果想赏钱，可以用微信扫描下面的二维码，感谢!
另外再次标注博客原地址 xiaorui.cc

跟我一起看zabbix源码之zabbix alerter.c报警逻辑

1 Response

发表评论取消回复

1 Response

发表评论 取消回复

发表评论取消回复