/*
** Copyright (C) 2001-2025 Zabbix SIA
**
** This program is free software: you can redistribute it and/or modify it under the terms of
** the GNU Affero General Public License as published by the Free Software Foundation, version 3.
**
** This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
** without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
** See the GNU Affero General Public License for more details.
**
** You should have received a copy of the GNU Affero General Public License along with this program.
** If not, see <https://www.gnu.org/licenses/>.
**/

#include "zbxstr.h"
#include "zbxlog.h"
#include "zbxhttp.h"
#include "zbxtypes.h"
#include "zbxalgo.h"
#include "zbxexpr.h"

/*
 * Compares the substring src[loc->l .. loc->r] (inclusive bounds) with text,
 * ignoring case.
 *
 * Returns 0 when the lengths match and the characters are equal ignoring case;
 * a non-zero value otherwise.
 */
static int	str_loc_cmp(const char *src, const zbx_strloc_t *loc, const char *text, size_t text_len)
{
	ZBX_RETURN_IF_NOT_EQUAL(loc->r - loc->l + 1, text_len);

	return zbx_strncasecmp(src + loc->l, text, text_len);
}

/*
 * Returns a newly allocated, NUL-terminated copy of src[loc->l .. loc->r].
 * The caller owns the returned string.
 */
static char	*str_loc_dup(const char *src, const zbx_strloc_t *loc)
{
	char	*str;
	size_t	len;

	len = loc->r - loc->l + 1;
	str = zbx_malloc(NULL, len + 1);
	memcpy(str, src + loc->l, len);
	str[len] = '\0';

	return str;
}

/* HTML ASCII whitespace: space, tab, line feed, form feed, carriage return */
#define ZBX_HTML_WHITESPACE		" \t\n\f\r"
#define ZBX_ATTRIBUTE_NAME_CHARLIST	ZBX_HTML_WHITESPACE "\"'=<>`/"

/*
 * Checks whether c is an HTML whitespace character.
 * The explicit NUL test is required because strchr() also matches the
 * terminating NUL of the character list.
 */
static int	is_html_space(char c)
{
	return '\0' != c && NULL != strchr(ZBX_HTML_WHITESPACE, c);
}

/*
 * Returns the first position at or after pos in data that does not hold an
 * HTML whitespace character.
 */
static size_t	skip_html_spaces(const char *data, size_t pos)
{
	while (0 != is_html_space(data[pos]))
		pos++;

	return pos;
}

/*
 * Parses an HTML attribute name starting at data[pos].
 *
 * On success stores the inclusive name bounds in loc and returns SUCCEED.
 * Returns FAIL when the first character cannot start a name (this includes
 * the end of string, because strchr() matches the terminating NUL).
 */
static int	parse_attribute_name(const char *data, size_t pos, zbx_strloc_t *loc)
{
	const char	*ptr = data + pos;

	if (NULL != strchr(ZBX_ATTRIBUTE_NAME_CHARLIST, *ptr))
		return FAIL;

	/* the scan also stops at the end of string - strchr() matches the NUL */
	while (NULL == strchr(ZBX_ATTRIBUTE_NAME_CHARLIST, *(++ptr)))
		;

	loc->l = pos;
	loc->r = (size_t)(ptr - data) - 1;

	return SUCCEED;
}

#undef ZBX_ATTRIBUTE_NAME_CHARLIST

/*
 * Returns the first position at or after pos in data that is neither a space
 * nor a tab. Used when parsing Content-Type style parameter lists.
 */
static size_t	skip_spaces(const char *data, size_t pos)
{
	while (' ' == data[pos] || '\t' == data[pos])
		pos++;

	return pos;
}

/*
 * Parses the '=' between an HTML attribute name and its value.
 *
 * Returns SUCCEED and stores the location of '=' in loc when data[pos] is '=',
 * FAIL otherwise.
 */
static int	parse_attribute_op(const char *data, size_t pos, zbx_strloc_t *loc)
{
	if ('=' == data[pos])
	{
		loc->l = pos;
		loc->r = pos;

		return SUCCEED;
	}

	return FAIL;
}

#define ZBX_UNQUOTED_ATTRIBUTE_VALUE_CHARLIST	ZBX_HTML_WHITESPACE "\"'=<>`"

/*
 * Parses an HTML attribute value starting at data[pos].
 *
 * The value is either quoted with " or ' (the stored location then includes
 * both quotes) or unquoted (terminated by whitespace or one of "'=<>`).
 *
 * Returns SUCCEED and stores the inclusive value bounds in loc, or FAIL when
 * the value cannot start at data[pos] or a quoted value is not terminated.
 */
static int	parse_attribute_value(const char *data, size_t pos, zbx_strloc_t *loc)
{
	const char	*ptr;
	const char	*charlist;
	unsigned char	quoted;

	ptr = data + pos;

	if ('"' == *ptr)
	{
		charlist = "\"";
		quoted = 1;
	}
	else if ('\'' == *ptr)
	{
		charlist = "'";
		quoted = 1;
	}
	else if (NULL == strchr(ZBX_UNQUOTED_ATTRIBUTE_VALUE_CHARLIST, *ptr))
	{
		quoted = 0;
		charlist = ZBX_UNQUOTED_ATTRIBUTE_VALUE_CHARLIST;
	}
	else
		return FAIL;

	loc->l = pos;

	/* the scan also stops at the end of string - strchr() matches the NUL */
	while (NULL == strchr(charlist, *(++ptr)))
		;

	if (1 == quoted)
	{
		/* closing quote is missing */
		if ('\0' == *ptr)
			return FAIL;

		loc->r = (size_t)(ptr - data);
	}
	else
		loc->r = (size_t)(ptr - data) - 1;

	return SUCCEED;
}

#undef ZBX_UNQUOTED_ATTRIBUTE_VALUE_CHARLIST

/*
 * Parses one HTML attribute, either 'name' or 'name = value', at data[pos].
 *
 * For an attribute without a value loc_value is set equal to loc_name, so the
 * caller can continue scanning after loc_value->r in both cases.
 *
 * Returns SUCCEED when an attribute was parsed, FAIL otherwise.
 */
static int	parse_attribute_name_value(const char *data, size_t pos, zbx_strloc_t *loc_name,
		zbx_strloc_t *loc_value)
{
	zbx_strloc_t	loc_op;

	if (SUCCEED != parse_attribute_name(data, pos, loc_name))
		return FAIL;

	pos = skip_html_spaces(data, loc_name->r + 1);

	if (SUCCEED != parse_attribute_op(data, pos, &loc_op))
	{
		*loc_value = *loc_name;

		return SUCCEED;
	}

	pos = skip_html_spaces(data, loc_op.r + 1);

	if (SUCCEED != parse_attribute_value(data, pos, loc_value))
		return FAIL;

	return SUCCEED;
}

/*
 * Scans the attributes of a <meta> tag. data points just past the tag name.
 *
 * When a 'charset' attribute with a value is found, *charset receives a newly
 * allocated copy of the value (quotes and surrounding whitespace trimmed) and
 * the scan stops. Otherwise, when the tag has both http-equiv=content-type and
 * a 'content' attribute, *content receives a newly allocated, trimmed copy of
 * the content value.
 *
 * Returns the offset in data at which scanning stopped.
 */
static size_t	parse_html_attributes(const char *data, char **content, char **charset)
{
	size_t		pos = 0;
	zbx_strloc_t	loc_name, loc_value, loc_content = {0};
	int		http_equiv_content_found = 0, content_found = 0;

	pos = skip_html_spaces(data, pos);

	while (1)
	{
		if (FAIL == parse_attribute_name_value(data, pos, &loc_name, &loc_value))
			break;

		pos = skip_html_spaces(data, loc_value.r + 1);

		if (0 == str_loc_cmp(data, &loc_name, "http-equiv", ZBX_CONST_STRLEN("http-equiv")))
		{
			if (0 == str_loc_cmp(data, &loc_value, "\"content-type\"",
					ZBX_CONST_STRLEN("\"content-type\"")) ||
					0 == str_loc_cmp(data, &loc_value, "content-type",
					ZBX_CONST_STRLEN("content-type")) ||
					0 == str_loc_cmp(data, &loc_value, "'content-type'",
					ZBX_CONST_STRLEN("'content-type'")))
			{
				http_equiv_content_found = 1;
			}
		}
		else if (0 == str_loc_cmp(data, &loc_name, "content", ZBX_CONST_STRLEN("content")))
		{
			loc_content = loc_value;
			content_found = 1;
		}
		else if (0 == str_loc_cmp(data, &loc_name, "charset", ZBX_CONST_STRLEN("charset")))
		{
			/* a value-less attribute has its value location aliased to its name - */
			/* it must not be mistaken for a charset called "charset"              */
			if (loc_value.l == loc_name.l)
				continue;

			*charset = str_loc_dup(data, &loc_value);
			zbx_lrtrim(*charset, ZBX_HTML_WHITESPACE "\"'");

			return pos;
		}
	}

	if (1 == http_equiv_content_found && 1 == content_found)
	{
		*content = str_loc_dup(data, &loc_content);
		zbx_lrtrim(*content, ZBX_HTML_WHITESPACE "\"'");
	}

	return pos;
}

/*
 * Finds the next <meta> start tag in data. The tag name is matched ignoring
 * case and must be followed by HTML whitespace, so tags that merely begin with
 * "meta" are skipped.
 *
 * Returns a pointer just past the tag name, or NULL when no tag is found.
 */
static const char	*html_find_meta_tag(const char *data)
{
	while (NULL != (data = strchr(data, '<')))
	{
		data++;

		if (0 != zbx_strncasecmp(data, "meta", ZBX_CONST_STRLEN("meta")))
			continue;

		if (0 != is_html_space(data[ZBX_CONST_STRLEN("meta")]))
			return data + ZBX_CONST_STRLEN("meta");
	}

	return NULL;
}

/*
 * Walks the <meta> tags of an HTML document until one of them yields either a
 * charset (stored in *charset) or a content-type 'content' value (stored in
 * *content). Does nothing when *charset or *content is already set.
 */
static void	html_get_charset_content(const char *data, char **charset, char **content)
{
	while (NULL == *charset && NULL == *content && NULL != (data = html_find_meta_tag(data)))
		data += parse_html_attributes(data, content, charset);
}

#undef ZBX_HTML_WHITESPACE

#define ZBX_TSPECIALS			"()<>@,;:\"/[]?="
#define ZBX_CONTENT_TOKEN_CHARLIST	ZBX_TSPECIALS " \r\n"

/*
 * Parses a Content-Type parameter name (a token) starting at data[pos].
 *
 * Returns SUCCEED and stores the inclusive name bounds in loc, or FAIL when
 * data[pos] cannot start a token (including the end of string).
 */
static int	parse_content_name(const char *data, size_t pos, zbx_strloc_t *loc)
{
	const char	*ptr = data + pos;

	if (NULL != strchr(ZBX_CONTENT_TOKEN_CHARLIST, *ptr))
		return FAIL;

	/* the scan also stops at the end of string - strchr() matches the NUL */
	while (NULL == strchr(ZBX_CONTENT_TOKEN_CHARLIST, *(++ptr)))
		;

	loc->l = pos;
	loc->r = (size_t)(ptr - data) - 1;

	return SUCCEED;
}

/*
 * Parses the '=' between a Content-Type parameter name and its value.
 *
 * Returns SUCCEED and stores the location of '=' in loc when data[pos] is '=',
 * FAIL otherwise.
 */
static int	parse_content_op(const char *data, size_t pos, zbx_strloc_t *loc)
{
	if ('=' == data[pos])
	{
		loc->l = pos;
		loc->r = pos;

		return SUCCEED;
	}

	return FAIL;
}

/*
 * Parses a double-quoted parameter value; data[pos] is the opening quote.
 * Only the escape sequences \\, \n and \" are accepted inside the quotes.
 *
 * Returns SUCCEED and stores the bounds (including both quotes) in loc, or
 * FAIL on an unsupported escape sequence or a missing closing quote.
 */
static int	parse_quoted_content_value(const char *data, size_t pos, zbx_strloc_t *loc)
{
	const char	*ptr;

	ptr = data + pos;
	loc->l = pos;

	while ('"' != *(++ptr))
	{
		if ('\\' == *ptr)
		{
			ptr++;

			/* this also rejects a backslash at the very end of the string */
			if ('\\' != *ptr && 'n' != *ptr && '"' != *ptr)
				return FAIL;

			continue;
		}

		if ('\0' == *ptr)
			return FAIL;
	}

	loc->r = (size_t)(ptr - data);

	return SUCCEED;
}

/*
 * Parses a Content-Type parameter value, either a quoted string or a token,
 * starting at data[pos].
 *
 * Returns SUCCEED and stores the inclusive value bounds in loc (including the
 * quotes for a quoted value), FAIL otherwise.
 */
static int	parse_content_value(const char *data, size_t pos, zbx_strloc_t *loc)
{
	const char	*ptr;

	ptr = data + pos;

	if ('"' == *ptr)
		return parse_quoted_content_value(data, pos, loc);
	else if (NULL != strchr(ZBX_CONTENT_TOKEN_CHARLIST, *ptr))
		return FAIL;

	loc->l = pos;

	/* the scan also stops at the end of string - strchr() matches the NUL */
	while (NULL == strchr(ZBX_CONTENT_TOKEN_CHARLIST, *(++ptr)))
		;

	loc->r = (size_t)(ptr - data) - 1;

	return SUCCEED;
}

#undef ZBX_CONTENT_TOKEN_CHARLIST
#undef ZBX_TSPECIALS

/*
 * Parses one 'name = value' parameter starting at data[pos]. Spaces and tabs
 * are allowed around '='.
 *
 * Returns SUCCEED with the name and value locations filled in, FAIL otherwise.
 */
static int	parse_content_key_value(const char *data, size_t pos, zbx_strloc_t *loc_name,
		zbx_strloc_t *loc_value)
{
	zbx_strloc_t	loc_op;

	if (SUCCEED != parse_content_name(data, pos, loc_name))
		return FAIL;

	pos = skip_spaces(data, loc_name->r + 1);

	if (SUCCEED != parse_content_op(data, pos, &loc_op))
		return FAIL;

	pos = skip_spaces(data, loc_op.r + 1);

	if (SUCCEED != parse_content_value(data, pos, loc_value))
		return FAIL;

	return SUCCEED;
}

/*
 * Returns a newly allocated copy of the quoted string at src[loc->l .. loc->r]
 * with the surrounding quotes removed and the escape sequences \\, \n and \"
 * decoded. loc is expected to come from parse_quoted_content_value().
 * The caller owns the returned string.
 */
static char	*str_loc_unquote_dyn(const char *src, const zbx_strloc_t *loc)
{
	char	*str, *ptr;

	src += loc->l + 1;

	/* (r - l - 1) characters between the quotes plus the terminating NUL */
	str = ptr = zbx_malloc(NULL, loc->r - loc->l);

	while ('"' != *src)
	{
		if ('\\' == *src)
		{
			switch (*(++src))
			{
				case '\\':
					*ptr++ = '\\';
					break;
				case 'n':
					*ptr++ = '\n';
					break;
				case '"':
					*ptr++ = '"';
					break;
				default:
					/* not reachable for locations validated by the parser */
					break;
			}
		}
		else
			*ptr++ = *src;

		src++;
	}

	*ptr = '\0';

	return str;
}

/*
 * Scans a list of 'name=value' parameters for 'charset'. Parameters are
 * separated only by spaces/tabs; scanning stops at the first position that
 * does not parse as a parameter.
 *
 * Returns a newly allocated charset value (unquoted when it was quoted), or
 * NULL when no charset parameter was found. The caller owns the result.
 */
static char	*parse_content(const char *data)
{
	size_t		pos = 0;
	zbx_strloc_t	loc_name, loc_value;

	pos = skip_spaces(data, pos);

	while (1)
	{
		if (FAIL == parse_content_key_value(data, pos, &loc_name, &loc_value))
			break;

		pos = skip_spaces(data, loc_value.r + 1);

		if (0 == str_loc_cmp(data, &loc_name, "charset", ZBX_CONST_STRLEN("charset")))
		{
			if ('"' == *(data + loc_value.l))
				return str_loc_unquote_dyn(data, &loc_value);

			return str_loc_dup(data, &loc_value);
		}
	}

	return NULL;
}

/*
 * Trims spaces from *charset and, when nothing is left, releases it and sets
 * it to NULL so that an empty declaration does not mask the remaining
 * detection steps.
 */
static void	charset_discard_empty(char **charset)
{
	if (NULL == *charset)
		return;

	zbx_lrtrim(*charset, " ");

	if ('\0' == **charset)
		zbx_free(*charset);
}

/*
 * Determines the character set of an HTTP response.
 *
 * Sources are consulted in this order:
 *   1) the charset parameter of the Content-Type header value;
 *   2) "UTF-8" when the body is empty (len is 0);
 *   3) <meta charset=...> or <meta http-equiv=content-type content=...> tags
 *      in the body;
 *   4) the byte order mark of the body;
 *   5) "UTF-8" when the body is valid UTF-8, "WINDOWS-1252" otherwise.
 * An empty charset declaration is ignored and the next source is tried.
 *
 * Parameters: content_type - Content-Type header value, may be NULL
 *             body         - response body, read as a NUL-terminated string
 *             len          - response body length
 *
 * Returns a newly allocated, upper-cased charset name. The caller owns it.
 */
char	*zbx_determine_charset(const char *content_type, char *body, size_t len)
{
	const char	*ptr;
	char		*charset = NULL, *content = NULL;

	if (NULL != content_type)
	{
		if (NULL != (ptr = strchr(content_type, ';')))
			charset = parse_content(ptr + 1);

		charset_discard_empty(&charset);
	}

	if (NULL == charset && 0 == len)
		charset = zbx_strdup(NULL, "UTF-8");

	/* has no effect (and does not read body) when charset is already known */
	html_get_charset_content(body, &charset, &content);

	if (NULL != content && NULL == charset)
	{
		if (NULL != (ptr = strchr(content, ';')))
			charset = parse_content(ptr + 1);
	}

	zbx_free(content);
	charset_discard_empty(&charset);

	if (NULL == charset)
	{
		const char	*bom_encoding = zbx_get_bom_econding(body, len);

		if ('\0' != *bom_encoding)
			charset = zbx_strdup(NULL, bom_encoding);
		else if (SUCCEED == zbx_is_utf8(body))
			charset = zbx_strdup(NULL, "UTF-8");
		else
			charset = zbx_strdup(NULL, "WINDOWS-1252");
	}

	zbx_lrtrim(charset, " ");
	zbx_strupper(charset);

	return charset;
}