# HG changeset patch # User Jonathan Davies # Date 1288100551 -3600 # Node ID 48abfd03ac2c87bdaec712e1921b4234273a8247 # Parent 42c273981f8bd474df729e8dfd9e73ad6a66f6a0 CA-42914, SCTX-434: Increase the timeout for writing the database to the redo-log Previously, we had a flat timeout of 2 seconds for all redo-log operations. However, this has been shown to be too impatient for writing large databases over slow connections. (This resulted in timeouts firing prematurely, resulting in lots of METADATA_LUN_BROKEN alerts, and the redo-log being entirely useless for large pools as the database would never be successfully written!) The timeout for each operation can now be specified independently. The new default for database writes is 30 seconds. Signed-off-by: Jonathan Davies diff -r 42c273981f8b -r 48abfd03ac2c ocaml/database/block_device_io.ml --- a/ocaml/database/block_device_io.ml Tue Oct 26 14:42:30 2010 +0100 +++ b/ocaml/database/block_device_io.ml Tue Oct 26 14:42:31 2010 +0100 @@ -781,16 +781,16 @@ let str = String.make command_size '\000' in Unixext.really_read client str 0 command_size; + (* Note: none of the action functions throw any exceptions; they report errors directly to the client. *) + let (action_fn, block_time) = match str with + | "writedelta" -> action_writedelta, Xapi_globs.redo_log_max_block_time_writedelta + | "writedb___" -> action_writedb, Xapi_globs.redo_log_max_block_time_writedb + | "read______" -> action_read, Xapi_globs.redo_log_max_block_time_read + | "empty_____" -> action_empty, Xapi_globs.redo_log_max_block_time_empty + | _ -> (fun _ _ _ _ -> send_failure client (str^"|nack") ("Unknown command "^str)), 0. + in (* "Start the clock!" -- set the latest time by which we need to have responded to the client. *) - let target_response_time = Unix.gettimeofday() +. Xapi_globs.redo_log_max_block_time in - (* Note: none of the action functions throw any exceptions; they report errors directly to the client. *) - let action_fn = match str with - | "writedelta" -> action_writedelta - | "writedb___" -> action_writedb - | "read______" -> action_read - | "empty_____" -> action_empty - | _ -> (fun _ _ _ _ -> send_failure client (str^"|nack") ("Unknown command "^str)) - in + let target_response_time = Unix.gettimeofday() +. block_time in action_fn block_dev_fd client !datasock target_response_time with (* this must be an exception in Unixext.really_read because action_fn doesn't throw exceptions *) | End_of_file -> diff -r 42c273981f8b -r 48abfd03ac2c ocaml/database/redo_log.ml --- a/ocaml/database/redo_log.ml Tue Oct 26 14:42:30 2010 +0100 +++ b/ocaml/database/redo_log.ml Tue Oct 26 14:42:31 2010 +0100 @@ -176,9 +176,9 @@ In practise, this should never happen since whenever a writedelta fails, we attempt a writedb. *) let stop_at_missing_record = true -let get_latest_response_time () = +let get_latest_response_time block_time = let now = Unix.gettimeofday() in - now +. Xapi_globs.redo_log_max_block_time + now +. block_time (* Returns the PID of the process *) let start_io_process block_dev ctrlsockpath datasockpath = @@ -293,7 +293,7 @@ let action_empty sock datasockpath = R.debug "Performing empty"; (* Compute desired response time *) - let latest_response_time = get_latest_response_time() in + let latest_response_time = get_latest_response_time Xapi_globs.redo_log_max_block_time_empty in (* Empty *) let str = "empty_____" in Unixext.time_limited_write sock (String.length str) str latest_response_time; @@ -313,7 +313,7 @@ let action_read fn_db fn_delta sock datasockpath = R.debug "Performing read"; (* Compute desired response time *) - let latest_response_time = get_latest_response_time() in + let latest_response_time = get_latest_response_time Xapi_globs.redo_log_max_block_time_read in (* Write *) let str = "read______" in Unixext.time_limited_write sock (String.length str) str latest_response_time; @@ -323,7 +323,7 @@ let action_write_db marker generation_count write_fn sock datasockpath = R.debug "Performing writedb (generation %Ld)" generation_count; (* Compute desired response time *) - let latest_response_time = get_latest_response_time() in + let latest_response_time = get_latest_response_time Xapi_globs.redo_log_max_block_time_writedb in (* Send write command down control channel *) let str = Printf.sprintf "writedb___|%s|%016Ld" marker generation_count in Unixext.time_limited_write sock (String.length str) str latest_response_time; @@ -363,7 +363,7 @@ let action_write_delta marker generation_count data flush_db_fn sock datasockpath = R.debug "Performing writedelta (generation %Ld)" generation_count; (* Compute desired response time *) - let latest_response_time = get_latest_response_time() in + let latest_response_time = get_latest_response_time Xapi_globs.redo_log_max_block_time_writedelta in (* Write *) let str = Printf.sprintf "writedelta|%s|%016Ld|%016d|%s" marker generation_count (String.length data) data in Unixext.time_limited_write sock (String.length str) str latest_response_time; @@ -526,7 +526,7 @@ match !sock with | Some _ -> () (* We're already connected *) | None -> - let latest_connect_time = get_latest_response_time() in + let latest_connect_time = get_latest_response_time Xapi_globs.redo_log_max_startup_time in (* Now connect to the process via the socket *) let s = connect ctrlsockpath latest_connect_time in diff -r 42c273981f8b -r 48abfd03ac2c ocaml/xapi/xapi_globs.ml --- a/ocaml/xapi/xapi_globs.ml Tue Oct 26 14:42:30 2010 +0100 +++ b/ocaml/xapi/xapi_globs.ml Tue Oct 26 14:42:31 2010 +0100 @@ -495,10 +495,19 @@ (** {3 Settings related to the connection to the block device I/O process} *) -(** The maximum time, in seconds, for which we are prepared to wait for a response from the block device I/O process before assuming that it has died *) -let redo_log_max_block_time = 2. +(** The maximum time, in seconds, for which we are prepared to wait for a response from the block device I/O process before assuming that it has died while emptying *) +let redo_log_max_block_time_empty = 2. -(** The maximum time, in seconds, for which we are prepared to wait for a response from the block device I/O process while initially connecting to it before assuming that it has died *) +(** The maximum time, in seconds, for which we are prepared to wait for a response from the block device I/O process before assuming that it has died while reading *) +let redo_log_max_block_time_read = 30. + +(** The maximum time, in seconds, for which we are prepared to wait for a response from the block device I/O process before assuming that it has died while writing a delta *) +let redo_log_max_block_time_writedelta = 2. + +(** The maximum time, in seconds, for which we are prepared to wait for a response from the block device I/O process before assuming that it has died while writing a database *) +let redo_log_max_block_time_writedb = 30. + +(** The maximum time, in seconds, for which we are prepared to wait for a response from the block device I/O process before assuming that it has died while initially connecting to it *) let redo_log_max_startup_time = 5. (** The delay between each attempt to connect to the block device I/O process *)