MySQL · 2022-05-05 1

一则数据库Hang问题排查

案发现场

测试环境突然收到数据库无法连接的告警,连接上去排查问题,发现此时新的会话已经无法连接,紧急执行 pstack pidof mysqld > 3306.pstack ,然后将MySQL进行了重启。

数据库版本为percona server 5.7.22

证据分析

由于线上有对应的快照脚本,当时数据库已经无法接收新建连接时,但是我们的快照脚本在内部已经将案发现场的偷偷保存下来了,因此有了以下证据。

processlist:

[       {
        "Id":1163,
        "User":"cacti",
        "Host":"127.0.0.1:15580",
        "db":null,
        "Command":"Query",
        "Time":4,
        "State":"executing",
        "Info":"show global status where Variable_name in ('Threads_running', 'Threads_connected', 'Com_kill', 'Slow_queries')",
        "Rows_sent":0,
        "Rows_examined":0
        },

        {
        "Id":5035,
        "User":"mysql_monitor",
        "Host":"10.215.20.11:63262",
        "db":null,
        "Command":"Query",
        "Time":4,
        "State":"executing",
        "Info":"SHOW GLOBAL VARIABLES",
        "Rows_sent":0,
        "Rows_examined":0
    }....]

经过分析processlist,发现大部分SQL都是正常的查询语句,这两句SQL比较可疑,因为之前也碰到过类似的数据库hang,因此第一时间怀疑这两个SQL。

经过对pstack文件分析,我们找到了当时的堆栈

show global status

Thread 6 (Thread 0x7fba1698c700 (LWP 25258)):
#0  0x00007fba2427454d in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x00007fba2426fed1 in _L_lock_1093 () from /lib64/libpthread.so.0
#2  0x00007fba2426fe72 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3  0x0000000000cf549b in show_status_array(THD*, char const*, st_mysql_show_var*, enum_var_type, system_status_var*, char const*, TABLE_LIST*, bool, Item*) ()   # mysql_mutex_lock(&LOCK_global_system_variables);
#4  0x0000000000cf5644 in show_status_array(THD*, char const*, st_mysql_show_var*, enum_var_type, system_status_var*, char const*, TABLE_LIST*, bool, Item*) ()
#5  0x0000000000cfc7a0 in fill_status(THD*, TABLE_LIST*, Item*) ()
#6  0x0000000000ce8bdd in do_fill_table(THD*, TABLE_LIST*, QEP_TAB*) ()
#7  0x0000000000cfb62c in get_schema_tables_result(JOIN*, enum_schema_table_state) ()
#8  0x0000000000cdea7d in JOIN::prepare_result() ()
#9  0x0000000000c70570 in JOIN::exec() ()
#10 0x0000000000cdf3cd in handle_query(THD*, LEX*, Query_result*, unsigned long long, unsigned long long) ()
#11 0x000000000076a707 in execute_sqlcom_select(THD*, TABLE_LIST*) ()
#12 0x0000000000c9f698 in mysql_execute_command(THD*, bool) ()
#13 0x0000000000ca5afd in mysql_parse(THD*, Parser_state*) ()
#14 0x0000000000ca666c in dispatch_command(THD*, COM_DATA const*, enum_server_command) ()
#15 0x0000000000ca7f97 in do_command(THD*) ()
#16 0x0000000000d4c147 in threadpool_process_request(THD*) ()
#17 0x0000000000d5b25e in worker_main(void*) ()
#18 0x0000000000edb9b1 in pfs_spawn_thread ()
#19 0x00007fba2426dea5 in start_thread () from /lib64/libpthread.so.0
#20 0x00007fba22d1b9fd in clone () from /lib64/libc.so.6

# SHOW GLOBAL VARIABLES  

Thread 9 (Thread 0x7fba24734700 (LWP 25249)):
#0  0x00007fba2427454d in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x00007fba2426fe9b in _L_lock_883 () from /lib64/libpthread.so.0
#2  0x00007fba2426fd68 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3  0x0000000000d3180f in PolyLock_lock_log::rdlock() ()   
#4  0x0000000000c17695 in sys_var::value_ptr(THD*, THD*, enum_var_type, st_mysql_lex_string*) ()  
#5  0x0000000000cf01cc in get_one_variable_ext(THD*, THD*, st_mysql_show_var const*, enum_var_type, enum_mysql_show_type, system_status_var*, charset_info_st const**, char*, unsigned long*) ()
#6  0x0000000000cf0231 in get_one_variable(THD*, st_mysql_show_var const*, enum_var_type, enum_mysql_show_type, system_status_var*, charset_info_st const**, char*, unsigned long*) ()
#7  0x0000000000cf54fc in show_status_array(THD*, char const*, st_mysql_show_var*, enum_var_type, system_status_var*, char const*, TABLE_LIST*, bool, Item*) ()  ### mysql_mutex_lock(&LOCK_global_system_variables)
#8  0x0000000000cfae87 in fill_variables(THD*, TABLE_LIST*, Item*) ()  
#9  0x0000000000ce8bdd in do_fill_table(THD*, TABLE_LIST*, QEP_TAB*) ()
#10 0x0000000000cfb62c in get_schema_tables_result(JOIN*, enum_schema_table_state) ()
#11 0x0000000000cdea7d in JOIN::prepare_result() ()
#12 0x0000000000c70570 in JOIN::exec() ()
#13 0x0000000000cdf3cd in handle_query(THD*, LEX*, Query_result*, unsigned long long, unsigned long long) ()
#14 0x000000000076a707 in execute_sqlcom_select(THD*, TABLE_LIST*) ()
#15 0x0000000000ca21e4 in mysql_execute_command(THD*, bool) ()
#16 0x0000000000ca5afd in mysql_parse(THD*, Parser_state*) ()
#17 0x0000000000ca666c in dispatch_command(THD*, COM_DATA const*, enum_server_command) ()
#18 0x0000000000ca7f97 in do_command(THD*) ()
#19 0x0000000000d4c147 in threadpool_process_request(THD*) ()
#20 0x0000000000d5b25e in worker_main(void*) ()
#21 0x0000000000edb9b1 in pfs_spawn_thread ()
#22 0x00007fba2426dea5 in start_thread () from /lib64/libpthread.so.0
#23 0x00007fba22d1b9fd in clone () from /lib64/libc.so.6

经过代码分析,我们找到了这两个会话持有的mutex和等待的mutex

show global status

================================================

int fill_status(THD *thd, TABLE_LIST *tables, Item *cond)
{
  ...........
  if (thd->fill_status_recursion_level++ == 0) 
    mysql_mutex_lock(&LOCK_status);
    # 持有 LOCK_status 
  if (option_type == OPT_GLOBAL)
    calc_sum_of_all_status(status_var_ptr);
  // Push an empty tail element
  all_status_vars.push_back(st_mysql_show_var());
  res= show_status_array(thd, wild,
                         &all_status_vars[0],
                         option_type, status_var_ptr, "", tables,
                         upper_case_names, cond);
  all_status_vars.pop_back(); // Pop the empty element.

  if (thd->fill_status_recursion_level-- == 1) 
    mysql_mutex_unlock(&LOCK_status);
    # 释放 LOCK_status 

  DEBUG_SYNC(thd, "after_preparing_global_status_array");
  DBUG_RETURN(res);
}


================================================

static bool show_status_array(THD *thd, const char *wild,
                              SHOW_VAR *variables,
                              enum enum_var_type value_type,
                              struct system_status_var *status_var,
                              const char *prefix, TABLE_LIST *tl,
                              bool ucase_names,
                              Item *cond)
{
  .........
    if (show_type == SHOW_ARRAY)
    {
      show_status_array(thd, wild, (SHOW_VAR *) var->value, value_type,
                        status_var, name_buffer, tl, ucase_names, partial_cond);
    }
    else
    {
      if (!(wild && wild[0] && wild_case_compare(system_charset_info,
                                                 name_buffer, wild)) &&
          (!partial_cond || partial_cond->val_int()))
      {
        const char *pos;
        size_t length;

        mysql_mutex_lock(&LOCK_global_system_variables);
        # 等待 LOCK_global_system_variables

        pos= get_one_variable(thd, var, value_type, show_type, status_var,
                              &charset, buff, &length);
        table->field[1]->store(pos, (uint32) length, charset);
        thd->count_cuted_fields= CHECK_FIELD_IGNORE;
        table->field[1]->set_notnull();
        mysql_mutex_unlock(&LOCK_global_system_variables);

       ...........
}


SHOW GLOBAL VARIABLES  


================================================

static bool show_status_array(THD *thd, const char *wild,
                              SHOW_VAR *variables,
                              enum enum_var_type value_type,
                              struct system_status_var *status_var,
                              const char *prefix, TABLE_LIST *tl,
                              bool ucase_names,
                              Item *cond)
{
  .........
    if (show_type == SHOW_ARRAY)
    {
      show_status_array(thd, wild, (SHOW_VAR *) var->value, value_type,
                        status_var, name_buffer, tl, ucase_names, partial_cond);
    }
    else
    {
      if (!(wild && wild[0] && wild_case_compare(system_charset_info,
                                                 name_buffer, wild)) &&
          (!partial_cond || partial_cond->val_int()))
      {
        const char *pos;
        size_t length;

        mysql_mutex_lock(&LOCK_global_system_variables);
        #  持有 LOCK_global_system_variables

        pos= get_one_variable(thd, var, value_type, show_type, status_var,
                              &charset, buff, &length);
        table->field[1]->store(pos, (uint32) length, charset);
        thd->count_cuted_fields= CHECK_FIELD_IGNORE;
        table->field[1]->set_notnull();
        mysql_mutex_unlock(&LOCK_global_system_variables);

       ...........
}

================================================


void PolyLock_lock_log::rdlock()
{
  mysql_mutex_lock(mysql_bin_log.get_log_lock());
}

================================================

inline mysql_mutex_t* get_log_lock() { return &LOCK_log; }
# 等待 LOCK_log


此时我们发现,show global status 会话是先申请的LOCK_status,然后再申请 LOCK_global_system_variables。而SHOW GLOBAL VARIABLES 是先申请的LOCK_global_system_variables,然后再申请LOCK_log,此时他们两个并没有陷入线程死锁。

案件进一步分析

经过对pstack 的文件进一步分析,我们发现了新的证据,此堆栈为将执行Binlog的ordered_commit的

Thread 283 (Thread 0x7f80d0443700 (LWP 5621)):
#0  0x00007f83c362f54d in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x00007f83c362aed1 in _L_lock_1093 () from /lib64/libpthread.so.0
#2  0x00007f83c362ae72 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3  0x0000000000e589b0 in MYSQL_BIN_LOG::publish_coordinates_for_global_status() const ()
#4  0x0000000000e5ea5e in MYSQL_BIN_LOG::ordered_commit(THD*, bool, bool) ()
#5  0x0000000000e61d62 in MYSQL_BIN_LOG::commit(THD*, bool) ()
#6  0x0000000000806cb1 in ha_commit_trans(THD*, bool, bool) ()
#7  0x0000000000d4c86e in trans_commit_stmt(THD*) ()
#8  0x0000000000c9e6fa in mysql_execute_command(THD*, bool) ()
#9  0x0000000000ca5afd in mysql_parse(THD*, Parser_state*) ()
#10 0x0000000000ca666c in dispatch_command(THD*, COM_DATA const*, enum_server_command) ()
#11 0x0000000000ca7f97 in do_command(THD*) ()
#12 0x0000000000d4c147 in threadpool_process_request(THD*) ()
#13 0x0000000000d5b25e in worker_main(void*) ()
#14 0x0000000000edb9b1 in pfs_spawn_thread ()
#15 0x00007f83c3628ea5 in start_thread () from /lib64/libpthread.so.0
#16 0x00007f83c20d69fd in clone () from /lib64/libc.so.6

分析发现该会话publish_coordinates_for_global_status 需要申请LOCK_status的mutex,而ordered_commit 刚好持有了LOCK_log,具体代码如下

================================================

int MYSQL_BIN_LOG::ordered_commit(THD *thd, bool all, bool skip_commit)
{
  ......
  if (change_stage(thd, Stage_manager::FLUSH_STAGE, thd, NULL, &LOCK_log))   
  # 持有 LOCK_log
  {
    DBUG_PRINT("return", ("Thread ID: %u, commit_error: %d",
                          thd->thread_id(), thd->commit_error));
    DBUG_RETURN(finish_commit(thd));
  }


  publish_coordinates_for_global_status();

  ........
}

================================================

void MYSQL_BIN_LOG::publish_coordinates_for_global_status(void) const
{
  mysql_mutex_assert_owner(&LOCK_log);

  mysql_mutex_lock(&LOCK_status);                         
  # 等待 LOCK_status
  strcpy(binlog_global_snapshot_file, log_file_name);
  binlog_global_snapshot_position=
      my_b_inited(&log_file) ? my_b_tell(&log_file) : 0;
  mysql_mutex_unlock(&LOCK_status);
}


案件根因确认

因此我们确认了一个三者循环等待的锁关系,锁等待关系如下,使用mysqlslap 进行测试,也能复现。

分别开启三个会话,执行

Sess1:
mysqlslap  -uroot  -S /srv/my3321/run/mysql.sock -c 1 --create-schema=test --number-of-queries=10000000 -q "SHOW GLOBAL VARIABLES"

Sess2:
mysqlslap  -uroot  -S /srv/my3321/run/mysql.sock -c 1 --create-schema=test --number-of-queries=10000000 -q "show global status where Variable_name in ('Threads_running', 'Threads_connected', 'Com_kill', 'Slow_queries') "

Sess3:
Insert xxx ; commit;

然后数据库就Hang 了

问题总结:

该问题并不是在5.7.22 下一定会出现,我们的环境开启了 show_compatibility_56,关闭此参数的情况下,

SHOW GLOBAL VARIABLES 会从 Performance Schema global_variables table里面获取,SHOW GLOBAL STATUS 的结果是 Performance Schema global_status 表, 和 Com_xxx 计数器的合集,测试发现,没有发现数据库Hang的情况,确认代码层面,mutex的申请情况也少了很多,后期我们将会关闭该参数,降低因为内部mutex 获取顺序不一致导致的内部线程死锁的问题