# HG changeset patch
# User David Scott <dave.scott@xxxxxxxxxxxxx>
# Date 1260486290 0
# Node ID 9edc8c86f01dd5e951b944b5424e5b9383d0780c
# Parent ca92f46da128588874c6c660aef6409adae119dd
CA-33707: fix queueing deadlocking bug by never entering a queue with a lock
held.
We now request domain shutdown without the per-VM mutex held. This means these
may race with the background event thread performing a domain destruction.
The domain destruction/recreation part of {clean,hard}_{shutdown,reboot} is
placed in the domU_shutdown_queue by both the synchronous API path and the
event thread.
We remove the Vmops.Domain_shutdown_for_wrong_reason exception and replace it
with VM_{CRASHED,REBOOTED,HALTED} exceptions. We may yet be able to remove
these "errors" completely.
Work around many instances of VM_FAILED_SHUTDOWN_ACK by reissuing the shutdown
request every few seconds until the timeout expires.
If VM.{clean,hard}_reboot runs in parallel with an internal domain reboot then
only one reboot will probably happen.
If VM.{clean,hard}_shutdown runs in parallel with an internal domain reboot
then up to 10 retries to shut the VM down will be attempted.
Add FIST points to:
1. disable the event thread's handling of @releaseDomain
2. disable the synchronous API calls handling of domain destruction/recreation
3. disable the artificial VM reboot delay
4. simulate an internal shutdown (via Xc.domain_shutdown)
Add a series of tests to quicktest which run every combination of
* VM.{clean,hard}_{shutdown,reboot}
* parallel internal halt,reboot,crash
* synchronous thread only, event thread only, both
Signed-off-by: David Scott <dave.scott@xxxxxxxxxxxxx>
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/idl/api_errors.ml
--- a/ocaml/idl/api_errors.ml Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/idl/api_errors.ml Thu Dec 10 23:04:50 2009 +0000
@@ -136,6 +136,9 @@
let vm_shutdown_timeout = "VM_SHUTDOWN_TIMEOUT"
let vm_duplicate_vbd_device = "VM_DUPLICATE_VBD_DEVICE"
let vm_not_resident_here = "VM_NOT_RESIDENT_HERE"
+let vm_crashed = "VM_CRASHED"
+let vm_rebooted = "VM_REBOOTED"
+let vm_halted = "VM_HALTED"
let vms_failed_to_cooperate = "VMS_FAILED_TO_COOPERATE"
let domain_exists = "DOMAIN_EXISTS"
let cannot_reset_control_domain = "CANNOT_RESET_CONTROL_DOMAIN"
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/idl/datamodel.ml
--- a/ocaml/idl/datamodel.ml Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/idl/datamodel.ml Thu Dec 10 23:04:50 2009 +0000
@@ -618,6 +618,12 @@
~doc:"VM didn't acknowledge the need to shutdown." ();
error Api_errors.vm_shutdown_timeout [ "vm"; "timeout" ]
~doc:"VM failed to shutdown before the timeout expired" ();
+ error Api_errors.vm_crashed [ "vm" ]
+ ~doc:"The VM crashed" ();
+ error Api_errors.vm_rebooted [ "vm" ]
+ ~doc:"The VM unexpectedly rebooted" ();
+ error Api_errors.vm_halted [ "vm" ]
+ ~doc:"The VM unexpectedly halted" ();
error Api_errors.bootloader_failed [ "vm"; "msg" ]
~doc:"The bootloader returned an error" ();
error Api_errors.unknown_bootloader [ "vm"; "bootloader" ]
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/OMakefile
--- a/ocaml/xapi/OMakefile Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/OMakefile Thu Dec 10 23:04:50 2009 +0000
@@ -48,7 +48,7 @@
#OCamlProgram(upload_receive, $(COMMON) fileupload upload_receive)
-OCamlProgram(quicktestbin, quicktest quicktest_common quicktest_ocamltest
quicktest_storage quicktest_http quicktest_encodings quicktest_vm_placement
vm_placement ../xenops/squeeze_test quicktest_vm_memory_constraints
../util/vm_memory_constraints)
+OCamlProgram(quicktestbin, quicktest quicktest_common quicktest_ocamltest
quicktest_storage quicktest_http quicktest_encodings quicktest_vm_placement
vm_placement ../xenops/squeeze_test quicktest_vm_memory_constraints
../util/vm_memory_constraints quicktest_lifecycle)
OCamlProgram(stresstest, stresstest)
OCamlProgram(fakeguestagent, fakeguestagent)
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/events.ml
--- a/ocaml/xapi/events.ml Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/events.ml Thu Dec 10 23:04:50 2009 +0000
@@ -121,15 +121,19 @@
(try Db.VM.remove_from_other_config ~__context ~self:vm
~key:Xapi_globs.last_artificial_reboot_delay_key with _ -> ());
Db.VM.add_to_other_config ~__context ~self:vm
~key:Xapi_globs.last_artificial_reboot_delay_key ~value:(string_of_int 2);
0 in
- debug "Adding artificial delay on reboot for VM: %s. delay time=%d
seconds" (Ref.string_of vm) delay;
- Thread.delay (float_of_int delay)
-
+ if Xapi_fist.disable_reboot_delay ()
+ then debug "FIST: disable_reboot_delay"
+ else begin
+ debug "Adding artificial delay on reboot for VM: %s. delay time=%d
seconds" (Ref.string_of vm) delay;
+ Thread.delay (float_of_int delay)
+ end
+
let clear_reboot_delay ~__context ~vm =
try Db.VM.remove_from_other_config ~__context ~self:vm
~key:Xapi_globs.last_artificial_reboot_delay_key with _ -> ()
let perform_destroy ~__context ~vm token =
TaskHelper.set_description ~__context "destroy";
- Xapi_vm.Shutdown.in_dom0 { Xapi_vm.TwoPhase.__context = __context; vm=vm;
token=Some token; api_call_name="destroy"; clean=false };
+ Xapi_vm.Shutdown.in_dom0_already_locked { Xapi_vm.TwoPhase.__context =
__context; vm=vm; api_call_name="destroy"; clean=false };
update_allowed_ops_using_api ~__context vm
let perform_preserve ~__context ~vm token =
@@ -523,8 +527,12 @@
let action_taken = Resync.vm ~__context token vm in
if action_taken then debug "Action was taken so
allowed_operations should be updated";
in
- debug "adding Resync.vm to work queue";
- push vm Local_work_queue.domU_internal_shutdown_queue description
work_item;
+ if Xapi_fist.disable_event_lifecycle_path ()
+ then warn "FIST: disable_event_lifecycle_path: skipping
Resync.vm"
+ else begin
+ debug "adding Resync.vm to work queue";
+ push vm Local_work_queue.domU_internal_shutdown_queue
description work_item;
+ end
)
with Vm_corresponding_to_domid_not_in_db domid ->
error "event could not be processed because VM record not in database"
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/quicktest.ml
--- a/ocaml/xapi/quicktest.ml Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/quicktest.ml Thu Dec 10 23:04:50 2009 +0000
@@ -619,6 +619,7 @@
end;
vbd_pause_unpause_test s debian;
powercycle_test s debian;
+ Quicktest_lifecycle.test s debian;
vm_uninstall test s debian;
success test
with Unable_to_find_suitable_debian_template ->
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/quicktest_lifecycle.ml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ocaml/xapi/quicktest_lifecycle.ml Thu Dec 10 23:04:50 2009 +0000
@@ -0,0 +1,194 @@
+
+
+type 'a api_call =
+ | Shutdown of 'a
+ | Reboot of 'a
+
+type api_mode =
+ | Clean
+ | Hard
+
+type api = api_mode api_call
+
+type parallel_op =
+ | Internal_reboot
+ | Internal_halt
+ | Internal_suspend
+ | Internal_crash
+
+type code_path =
+ | Sync
+ | Event
+ | Both
+
+type result =
+ | Rebooted
+ | Halted
+
+let final_guest_state = function
+ | Shutdown _ -> Halted
+ | Reboot _ -> Rebooted
+
+type test = {
+ api: api option;
+ parallel_op: parallel_op option;
+ code_path: code_path;
+}
+
+let string_of_result = function
+ | Rebooted -> "Reboot"
+ | Halted -> "Halt"
+
+let expected_result = function
+ | { api = Some (Shutdown _); parallel_op = Some _; code_path = (Sync|Both) }
-> Some Halted
+ | { api = Some (Reboot _); parallel_op = Some _; code_path = (Sync|Both) }
-> Some Rebooted
+ | { api = Some (Shutdown _); parallel_op = None; code_path =
(Sync|Event|Both) } -> Some Halted
+ | { api = Some (Reboot _); parallel_op = None; code_path =
(Sync|Event|Both) } -> Some Rebooted
+ | { parallel_op = Some (Internal_halt | Internal_crash); code_path = Event }
-> Some Halted
+ | { parallel_op = Some Internal_reboot; code_path = Event } -> Some Rebooted
+
+ | { api = None; parallel_op = Some (Internal_halt (* | Internal_suspend *) |
Internal_crash); code_path = (Event|Both) } -> Some Halted
+ | { api = None; parallel_op = Some Internal_reboot; code_path = (Event|Both)
} -> Some Rebooted
+ | _ -> None (* invalid test *)
+
+
+let string_of_test x =
+ let string_of_api = function
+ | Shutdown Clean -> "clean_shutdown"
+ | Shutdown Hard -> "hard_shutdown "
+ | Reboot Clean -> "clean_reboot "
+ | Reboot Hard -> "hard_reboot " in
+ let string_of_parallel_op = function
+ | Internal_reboot -> "reboot "
+ | Internal_halt -> "halt "
+ | Internal_suspend -> "suspend "
+ | Internal_crash -> "crash " in
+ let string_of_code_path = function
+ | Sync -> "synch "
+ | Event -> "event "
+ | Both -> "both " in
+ let dm f x = match x with
+ | None -> "Nothing "
+ | Some x -> f x in
+ Printf.sprintf "%s %s %s -> %s"
+ (dm string_of_api x.api) (dm string_of_parallel_op x.parallel_op)
(string_of_code_path x.code_path)
+ (match expected_result x with None -> "invalid" | Some y ->
string_of_result y)
+open List
+
+let all_possible_tests =
+ let all_api_variants x =
+ [ { x with api = None };
+ { x with api = Some (Shutdown Clean) };
+ { x with api = Some (Shutdown Hard) };
+ { x with api = Some (Reboot Clean) };
+ { x with api = Some (Reboot Hard) } ] in
+ let all_parallel_op_variants x =
+ [ { x with parallel_op = None };
+ { x with parallel_op = Some Internal_reboot };
+ { x with parallel_op = Some Internal_halt };
+ { x with parallel_op = Some Internal_suspend };
+ { x with parallel_op = Some Internal_crash } ] in
+ let all_code_path_variants x =
+ [ { x with code_path = Sync };
+ { x with code_path = Event };
+ { x with code_path = Both } ] in
+
+ let xs = [ { api = None; parallel_op = None; code_path = Sync } ] in
+ concat (map all_code_path_variants (concat (map all_parallel_op_variants
(concat (map all_api_variants xs)))))
+
+let all_valid_tests = List.filter (fun t -> expected_result t <> None)
all_possible_tests
+
+ (*
+let _ =
+ List.iter print_endline (map string_of_test all_valid_tests);
+ Printf.printf "In total there are %d tests.\n" (List.length all_valid_tests)
+ *)
+
+open Quicktest_common
+open Client
+open Pervasiveext
+
+let one s debian test =
+ let t = make_test (string_of_test test) 1 in
+ start t;
+ let event = "/tmp/fist_disable_event_lifecycle_path" in
+ let sync = "/tmp/fist_disable_sync_lifecycle_path" in
+ let simulate = "/tmp/fist_simulate_internal_shutdown" in
+ let delay = "/tmp/fist_disable_reboot_delay" in
+
+ finally
+ (fun () ->
+ try
+ begin
+ Unixext.unlink_safe simulate;
+ Unixext.touch_file delay;
+ match test.code_path with
+ | Sync ->
+ Unixext.unlink_safe sync;
+ Unixext.touch_file event
+ | Event ->
+ Unixext.unlink_safe event;
+ Unixext.touch_file sync
+ | Both ->
+ Unixext.unlink_safe sync;
+ Unixext.unlink_safe event
+ end;
+ if Client.VM.get_power_state !rpc s debian = `Halted
+ then Client.VM.start !rpc s debian false false;
+
+ let call_api = function
+ | Shutdown Clean -> Client.VM.clean_shutdown
!rpc s debian
+ | Shutdown Hard -> Client.VM.hard_shutdown
!rpc s debian
+ | Reboot Clean -> Client.VM.clean_reboot !rpc
s debian
+ | Reboot Hard -> Client.VM.hard_reboot !rpc s
debian in
+
+ let domid = Client.VM.get_domid !rpc s debian in
+ begin match test with
+ | { api = None; parallel_op = Some x } ->
+ let reason = match x with
+ | Internal_reboot -> Xc.Reboot
+ | Internal_halt -> Xc.Halt
+ | Internal_crash -> Xc.Crash
+ | Internal_suspend -> Xc.Suspend in
+ Xc.with_intf (fun xc ->
Xc.domain_shutdown xc (Int64.to_int domid) reason)
+ | { api = Some x; parallel_op = Some y } ->
+ let reason = match y with
+ | Internal_reboot -> "reboot"
+ | Internal_halt -> "halt"
+ | Internal_crash -> "crash"
+ | Internal_suspend -> "suspend" in
+ Unixext.write_string_to_file simulate
reason;
+ call_api x
+ | { api = Some x; parallel_op = None } ->
+ call_api x
+ | t -> failwith (Printf.sprintf "Invalid test: %s"
(string_of_test t))
+ end;
+
+ let wait_for_domid p =
+ let start = Unix.gettimeofday () in
+ let finished = ref false in
+ while Unix.gettimeofday () -. start < 300. &&
(not !finished) do
+ finished := p (Client.VM.get_domid !rpc s
debian);
+ if not !finished then Thread.delay 1.
+ done;
+ if not !finished then failwith "timeout"
+ in
+
+ begin match expected_result test with
+ | None -> failwith (Printf.sprintf "Invalid test:
%s" (string_of_test test))
+ | Some Rebooted ->
+ wait_for_domid (fun domid' -> domid <>
domid')
+ | Some Halted ->
+ wait_for_domid (fun domid' -> domid' =
-1L)
+ end
+ with e -> failed t (Printexc.to_string e)
+ )
+ (fun () ->
+ Unixext.unlink_safe sync;
+ Unixext.unlink_safe event;
+ Unixext.unlink_safe delay
+ );
+ success t
+
+let test s debian =
+ List.iter (one s debian) all_valid_tests
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/vmops.ml
--- a/ocaml/xapi/vmops.ml Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/vmops.ml Thu Dec 10 23:04:50 2009 +0000
@@ -777,59 +777,52 @@
)
-
-let match_xal_and_shutdown xalreason reason =
- debug "Comparing XAL %s with Domain %s"
- (Xal.string_of_died_reason xalreason)
- (Domain.string_of_shutdown_reason reason);
- match xalreason, reason with
- | Xal.Crashed, _ -> false
- | Xal.Vanished, _ -> false
- | Xal.Halted, (Domain.Halt | Domain.PowerOff) -> true
- | Xal.Rebooted, Domain.Reboot -> true
- | Xal.Suspended, Domain.Suspend -> true
- | Xal.Shutdown i, Domain.Unknown i2 -> i = i2
- | _, _ -> false
-
(** Thrown if clean_shutdown_with_reason exits for the wrong reason: eg the
domain
crashed or rebooted *)
exception Domain_shutdown_for_wrong_reason of Xal.died_reason
-(** Tells a VM to shutdown with a specific reason (reboot/halt/poweroff). *)
+(** Tells a VM to shutdown with a specific reason (reboot/halt/poweroff),
waits for
+ it to shutdown (or vanish) and then return the reason.
+ Note this is not always called with the per-VM mutex. *)
let clean_shutdown_with_reason ?(at = fun _ -> ()) ~xal ~__context ~self domid
reason =
(* Set the task allowed_operations to include cancel *)
if reason <> Domain.Suspend then TaskHelper.set_cancellable ~__context;
at 0.25;
- (* Windows PV drivers will respond within 10s according to ssmith and
- improving this is likely to happen in a Rio timeframe (CA-3964). It's
- still possible (although unlikely) for us to timeout just before the
- drivers activate but the worst we'll suffer is a shutdown failure
- followed by a spontaneous shutdown (which can happen anyway). Having
- this check in here allows us to bail out quickly in the common case
- of the PV drivers being missing. *)
- with_xs (fun xs ->
- let xc = Xal.xc_of_ctx xal in
- if not (Domain.shutdown_ack ~timeout:60. ~xc ~xs domid reason) then
- raise (Api_errors.Server_error
(Api_errors.vm_failed_shutdown_ack, []))
- );
+ let xs = Xal.xs_of_ctx xal in
+ let xc = Xal.xc_of_ctx xal in
+ begin
+ (* Wait for up to 60s for the VM to acknowledge the shutdown request.
In case the guest
+ misses our original request, keep making additional ones. *)
+ let finished = ref false in
+ let timeout = 60.0 in
+ let start = Unix.gettimeofday () in
+ while Unix.gettimeofday () -. start < timeout && not !finished do
+ try
+ (* Make the shutdown request: this will fail if the domain has
vanished. *)
+ Domain.shutdown ~xs domid reason;
+ (* Wait for any necessary acknowledgement. If we get a
Watch.Timeout _ then
+ we abort early; otherwise we continue in Xal.wait_release
below. *)
+ Domain.shutdown_wait_for_ack ~timeout:10. ~xc ~xs domid reason;
+ finished := true
+ with
+ | Watch.Timeout _ -> () (* try again *)
+ | e ->
+ debug "Caught and ignoring exception: %s"
(ExnHelper.string_of_exn e);
+ log_backtrace ();
+ finished := true
+ done;
+ if not !finished then raise (Api_errors.Server_error
(Api_errors.vm_failed_shutdown_ack, []))
+ end;
at 0.50;
let total_timeout = 20. *. 60. in (* 20 minutes *)
(* Block for 5s at a time, in between check to see whether we've been
cancelled
and update our progress if not *)
let start = Unix.gettimeofday () in
- let finished = ref false in
- while (Unix.gettimeofday () -. start < total_timeout) && not(!finished) do
+ let result = ref None in
+ while (Unix.gettimeofday () -. start < total_timeout) && (!result = None) do
try
- let r = Xal.wait_release xal ~timeout:5. domid in
- if not (match_xal_and_shutdown r reason) then begin
- let errmsg = Printf.sprintf
- "Domain died with reason: %s when it should have been %s"
- (Xal.string_of_died_reason r) (Domain.string_of_shutdown_reason
reason) in
- debug "%s" errmsg;
- raise (Domain_shutdown_for_wrong_reason r)
- end;
- finished := true;
+ result := Some (Xal.wait_release xal ~timeout:5. domid);
with Xal.Timeout ->
if reason <> Domain.Suspend && TaskHelper.is_cancelling ~__context
then raise (Api_errors.Server_error(Api_errors.task_cancelled, [
Ref.string_of (Context.get_task_id __context) ]));
@@ -837,9 +830,11 @@
let progress = min ((Unix.gettimeofday () -. start) /. total_timeout) 1.
in
at (0.50 +. 0.25 *. progress)
done;
- if not(!finished)
- then raise (Api_errors.Server_error(Api_errors.vm_shutdown_timeout, [
Ref.string_of self; string_of_float total_timeout ]));
- at 1.0
+ match !result with
+ | None -> raise (Api_errors.Server_error(Api_errors.vm_shutdown_timeout, [
Ref.string_of self; string_of_float total_timeout ]))
+ | Some x ->
+ at 1.0;
+ x
(* !!! FIX ME - This allows a 10% overhead on static_max for size of suspend
image !!! *)
let get_suspend_space __context vm =
@@ -877,9 +872,19 @@
Domain.suspend ~xc ~xs ~hvm domid fd []
~progress_callback:progress_cb
(fun () ->
-
clean_shutdown_with_reason ~xal
+
match clean_shutdown_with_reason ~xal
~__context ~self:vm domid
-
Domain.Suspend
+
Domain.Suspend with
+
| Xal.Suspended -> () (* good *)
+
| Xal.Crashed ->
+
raise (Api_errors.Server_error(Api_errors.vm_crashed,
[ Ref.string_of vm ]))
+
| Xal.Rebooted ->
+
raise
(Api_errors.Server_error(Api_errors.vm_rebooted, [ Ref.string_of vm ]))
+
| Xal.Halted
+
| Xal.Vanished ->
+
raise (Api_errors.Server_error(Api_errors.vm_halted,
[ Ref.string_of vm ]))
+
| Xal.Shutdown x ->
+
failwith (Printf.sprintf "Expected domain shutdown
reason: %d" x)
)
);
(* If the suspend
succeeds, set the suspend_VDI *)
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/xapi_fist.ml
--- a/ocaml/xapi/xapi_fist.ml Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/xapi_fist.ml Thu Dec 10 23:04:50 2009 +0000
@@ -26,6 +26,8 @@
try
Some (Unixext.read_whole_file_to_string ("/tmp/fist_" ^ name))
with _ -> None
+
+let delete name = Unixext.unlink_safe ("/tmp/fist_" ^ name)
(** Insert 2 * Xapi_globs.max_clock_skew into the heartbeat messages *)
let insert_clock_skew () = fistpoint "insert_clock_skew"
@@ -94,3 +96,18 @@
(** Set the expiry date of a v6-license to the one in the file *)
let set_expiry_date () = fistpoint_read "set_expiry_date"
+(** Forces synchronous lifecycle path to defer to the event thread *)
+let disable_sync_lifecycle_path () = fistpoint "disable_sync_lifecycle_path"
+
+(** Forces synchronous lifecycle path by partially disabling the event thread
*)
+let disable_event_lifecycle_path () = fistpoint "disable_event_lifecycle_path"
+
+(** If set to "reboot" "halt" "suspend" "crash" this will forcibly shutdown
the domain during reboot/shutdown *)
+let simulate_internal_shutdown () =
+ let fist = "simulate_internal_shutdown" in
+ let x = fistpoint_read fist in
+ delete fist;
+ x
+
+(** Disables the artificial reboot delay, for faster testing. *)
+let disable_reboot_delay () = fistpoint "disable_reboot_delay"
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/xapi_vm.ml
--- a/ocaml/xapi/xapi_vm.ml Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/xapi_vm.ml Thu Dec 10 23:04:50 2009 +0000
@@ -298,7 +298,6 @@
(** The signature of a single phase of reboot or shutdown *)
type args = { __context: Context.t;
vm: API.ref_VM;
- token: Locking_helpers.token option;
api_call_name: string;
clean: bool }
@@ -307,31 +306,85 @@
in_guest : args -> unit;
in_dom0 : args -> unit;
}
- let execute (x: args) (y: t) =
- y.in_guest x;
- y.in_dom0 x
+
+ (** Called with the per-VM lock held. Evaluates to true if the VM has been
rebooted (eg by the event thread) *)
+ let is_vm_running x =
+ (* The VM may have been rebooted by the event thread: in this case
there is no work to do *)
+ let domid = Helpers.domid_of_vm x.__context x.vm in
+ true
+ && domid <> -1 (* someone set the state to Halted *)
+ && (with_xc
+ (fun xc ->
+ let di = Xc.domain_getinfo xc domid in
+ Xal.is_running di))
+
+ (** Called before a regular synchronous reboot/shutdown to simulate parallel
in-guest shutdowns *)
+ let simulate_internal_shutdown domid =
+ Helpers.log_exn_continue (Printf.sprintf "simulate_internal_shutdown
domid=%d" domid)
+ (fun () ->
+ match Xapi_fist.simulate_internal_shutdown () with
+ | Some x ->
+ let x = String.strip String.isspace x in
+ with_xc
+ (fun xc ->
+ warn "FIST: simulating
internal %s for domid=%d" x domid;
+ match x with
+ | "reboot" ->
Xc.domain_shutdown xc domid Xc.Reboot
+ | "halt" ->
Xc.domain_shutdown xc domid Xc.Halt
+ | "suspend" ->
Xc.domain_shutdown xc domid Xc.Suspend
+ | "crash" ->
Xc.domain_shutdown xc domid Xc.Crash
+ | _ -> failwith
"Unknown simulate_internal_shutdown code");
+ (* pause for 5s which probably lets the
event thread do something (unless it is disabled) *)
+ Thread.delay 5.
+ | None -> ()
+ ) ()
end
+
module Reboot = struct
(** This module contains the low-level implementation actions, as distinct
from the tangle
of policy which comes later. *)
- let in_guest { TwoPhase.__context = __context; vm=vm; token=token;
api_call_name=api_call_name; clean=clean } =
- if clean then begin
- debug "%s phase 0/3: shutting down existing domain" api_call_name;
- let domid = Helpers.domid_of_vm ~__context ~self:vm in
- with_xal (fun xal -> Vmops.clean_shutdown_with_reason ~xal
- ~at:(fun x -> TaskHelper.set_progress ~__context (x /. 2.))
- ~__context ~self:vm domid Domain.Reboot);
- end else debug "%s phase 0/3: no shutdown request required since this is a
hard_reboot" api_call_name
-
- let in_dom0 { TwoPhase.__context = __context; vm=vm; token=token;
api_call_name=api_call_name; clean=clean } =
+ (** Run without the per-VM lock to request the guest shuts itself down (if
clean) *)
+ let in_guest { TwoPhase.__context = __context; vm=vm;
api_call_name=api_call_name; clean=clean } =
+ let domid = Helpers.domid_of_vm ~__context ~self:vm in
+ TwoPhase.simulate_internal_shutdown domid;
+
+ (* NB a parallel internal halt may leave the domid as -1. If so then
there's no work for us
+ to do here. *)
+ if domid <> -1 then begin
+ if clean then begin
+ debug "%s phase 0/3: shutting down existing domain (domid: %d)"
api_call_name domid;
+ match with_xal (fun xal -> Vmops.clean_shutdown_with_reason ~xal
+ ~at:(fun x ->
TaskHelper.set_progress ~__context (x /. 2.))
+ ~__context ~self:vm
domid Domain.Reboot) with
+ | Xal.Vanished
+ | Xal.Rebooted -> () (* good *)
+ | Xal.Suspended ->
+ error "VM: %s suspended when asked to reboot"
(Ref.string_of vm)
+ | Xal.Crashed ->
+ error "VM: %s crashed when asked to reboot"
(Ref.string_of vm)
+ | Xal.Halted ->
+ error "VM: %s halted when asked to reboot"
(Ref.string_of vm)
+ end else begin
+ debug "%s phase 0/3: no shutdown request required since this is
a hard_reboot" api_call_name;
+ (* The domain might be killed by the event thread. Again, this
is ok. *)
+ Helpers.log_exn_continue (Printf.sprintf "Xc.domain_shutdown
domid=%d Xc.Reboot" domid)
+ (fun () ->
+ with_xc (fun xc -> Xc.domain_shutdown xc domid
Xc.Reboot)
+ ) ()
+ end
+ end
+
+ (** Once the domain has shutdown and the VM is locked, perform the reboot
immediately *)
+ let in_dom0_already_locked { TwoPhase.__context = __context; vm=vm;
api_call_name=api_call_name; clean=clean } =
License_check.vm ~__context vm;
Stats.time_this "VM reboot (excluding clean shutdown phase)"
(fun () ->
let new_snapshot = Db.VM.get_record ~__context ~self:vm in
- let current_snapshot = Helpers.get_boot_record ~__context ~self:vm in
+
+ let current_snapshot = Helpers.get_boot_record ~__context
~self:vm in
(* Master will have already checked the new memory_max and placed the
max of
the current and new values in the current_snapshot.
Just in case someone raced with us and bumped the static_max
*again* we
@@ -343,8 +396,7 @@
let new_snapshot = { new_snapshot with API.vM_memory_static_max =
new_mem } in
let localhost = Helpers.get_localhost ~__context in
-
- let domid = Helpers.domid_of_vm ~__context ~self:vm in
+ let domid = Helpers.domid_of_vm ~__context ~self:vm in
debug "%s phase 1/3: destroying old domain" api_call_name;
(* CA-13585: prevent glitch where power-state goes to Halted in the
middle of a reboot.
If an error causes us to leave this function then the event thread
should resynchronise
@@ -366,7 +418,6 @@
Helpers.set_boot_record ~__context ~self:vm new_snapshot;
debug "%s phase 2/3: starting new domain" api_call_name;
- Opt.iter (Locking_helpers.assert_locked vm) token;
begin
try
Vmops.start_paused
@@ -391,12 +442,28 @@
);
Db.VM.set_resident_on ~__context ~self:vm ~value:localhost;
Db.VM.set_power_state ~__context ~self:vm ~value:`Running;
- Opt.iter (Locking_helpers.assert_locked vm) token;
with exn ->
error "Caught exception during %s: %s" api_call_name
(ExnHelper.string_of_exn exn);
with_xc_and_xs (fun xc xs -> Vmops.destroy ~__context ~xc ~xs
~self:vm domid `Halted);
raise exn
)
+
+ (** In the synchronous API call paths, acquire the VM lock and see if the VM
hasn't rebooted yet.
+ If necessary we reboot it here. *)
+ let in_dom0_already_queued args =
+ Locking_helpers.with_lock args.TwoPhase.vm
+ (fun _ _ ->
+ if TwoPhase.is_vm_running args
+ then debug "VM %s has already rebooted: taking no
action" (Ref.string_of args.TwoPhase.vm)
+ else in_dom0_already_locked args) ()
+
+ (** In the synchronouse API call paths, wait in the
domU_internal_shutdown_queue and then attempt
+ to reboot the VM. NB this is the same queue used by the event thread.
*)
+ let in_dom0 args =
+ Local_work_queue.wait_in_line Local_work_queue.domU_internal_shutdown_queue
+ (Context.string_of_task args.TwoPhase.__context)
+ (fun () -> in_dom0_already_queued args)
+
let actions = { TwoPhase.in_guest = in_guest; in_dom0 = in_dom0 }
end
@@ -404,32 +471,64 @@
(** This module contains the low-level implementation actions, as distinct
from the tangle
of policy which comes later. *)
- let in_guest { TwoPhase.__context=__context; vm=vm; token=token;
api_call_name=api_call_name; clean=clean } =
- Opt.iter (Locking_helpers.assert_locked vm) token;
+ (** Run without the per-VM lock to request the guest shuts itself down (if
clean) *)
+ let in_guest { TwoPhase.__context=__context; vm=vm;
api_call_name=api_call_name; clean=clean } =
assert_ha_always_run_is_false ~__context ~vm;
+ let domid = Helpers.domid_of_vm ~__context ~self:vm in
+ TwoPhase.simulate_internal_shutdown domid;
- if clean then begin
- debug "%s: phase 1/2: waiting for the domain to shutdown" api_call_name;
- let domid = Helpers.domid_of_vm ~__context ~self:vm in
-
- with_xal (fun xal -> Vmops.clean_shutdown_with_reason ~xal
- ~at:(TaskHelper.set_progress ~__context)
- ~__context ~self:vm domid Domain.Halt);
- end else debug "%s phase 0/3: no shutdown request required since this is a
hard_shutdown" api_call_name
+ (* NB a parallel internal halt may leave the domid as -1. If so then
there's no work for us
+ to do here. *)
+ if domid <> -1 then begin
+ if clean then begin
+ debug "%s: phase 1/2: waiting for the domain to shutdown"
api_call_name;
+
+ match with_xal (fun xal -> Vmops.clean_shutdown_with_reason ~xal
+
~at:(TaskHelper.set_progress ~__context)
+ ~__context ~self:vm
domid Domain.Halt) with
+ | Xal.Vanished
+ | Xal.Halted -> () (* good *)
+ | Xal.Suspended ->
+ (* Log the failure but continue *)
+ error "VM: %s suspended when asked to shutdown"
(Ref.string_of vm)
+ | Xal.Crashed ->
+ (* Log the failure but continue *)
+ error "VM: %s crashed when asked to shutdown"
(Ref.string_of vm)
+ | Xal.Rebooted ->
+ (* Log the failure but continue *)
+ error "VM: %s attempted to reboot when asked to
shutdown" (Ref.string_of vm)
+ end else begin
+ debug "%s phase 0/3: no shutdown request required since this is
a hard_shutdown" api_call_name;
+ (* The domain might be killed by the event thread. Again, this
is ok. *)
+ Helpers.log_exn_continue (Printf.sprintf "Xc.domain_shutdown
domid=%d Xc.Halt" domid)
+ (fun () ->
+ with_xc (fun xc -> Xc.domain_shutdown xc domid
Xc.Halt)
+ ) ()
+ end
+ end
- let in_dom0 { TwoPhase.__context=__context; vm=vm; token=token;
api_call_name=api_call_name; clean=clean } =
- (* Invoke pre_destroy hook *)
- Xapi_hooks.vm_pre_destroy ~__context ~reason:(if clean then
Xapi_hooks.reason__clean_shutdown else Xapi_hooks.reason__hard_shutdown) ~vm;
-
+ (** Run with the per-VM lock held to clean up any shutdown domain. Note if
the VM has been rebooted
+ then we abort with OTHER_OPERATION_IN_PROGRESS. See
[retry_on_conflict] *)
+ let in_dom0_already_locked { TwoPhase.__context=__context; vm=vm;
api_call_name=api_call_name; clean=clean } =
+ (* If the VM has been shutdown by the event thread (domid = -1) then
there's no domain to destroy. *)
+ (* If the VM is running again then throw an error to trigger
retry_on_conflict *)
let domid = Helpers.domid_of_vm ~__context ~self:vm in
- if domid <> -1 then begin
- debug "%s: phase 2/2: destroying old domain (domid %d)" api_call_name
domid;
- with_xc_and_xs (fun xc xs ->
- Vmops.destroy ~__context ~xc ~xs ~self:vm domid `Halted;
- (* Force an update of the stats - this will cause the
rrds to be synced back to the master *)
- Monitor.do_monitor __context xc
- );
- end;
+ if domid <> -1 then begin
+ with_xc_and_xs
+ (fun xc xs ->
+ let di = Xc.domain_getinfo xc domid in
+ (* If someone rebooted it while we dropped the lock:
*)
+ if Xal.is_running di
+ then raise
(Api_errors.Server_error(Api_errors.other_operation_in_progress, [ "VM";
Ref.string_of vm ]));
+
+ (* Invoke pre_destroy hook *)
+ Xapi_hooks.vm_pre_destroy ~__context ~reason:(if
clean then Xapi_hooks.reason__clean_shutdown else
Xapi_hooks.reason__hard_shutdown) ~vm;
+ debug "%s: phase 2/2: destroying old domain (domid
%d)" api_call_name domid;
+ Vmops.destroy ~__context ~xc ~xs ~self:vm domid `Halted;
+ (* Force an update of the stats - this will cause the
rrds to be synced back to the master *)
+ Monitor.do_monitor __context xc
+ )
+ end;
if Db.VM.get_power_state ~__context ~self:vm = `Suspended then begin
debug "hard_shutdown: destroying any suspend VDI";
@@ -445,6 +544,22 @@
Xapi_vm_lifecycle.force_state_reset ~__context ~self:vm ~value:`Halted
end
+ (** In the synchronous API call paths, acquire the lock, check if the VM's
domain has shutdown (if not error out)
+ and continue with the shutdown *)
+ let in_dom0_already_queued args =
+ Locking_helpers.with_lock args.TwoPhase.vm
+ (fun _ _ ->
+ if TwoPhase.is_vm_running args
+ then raise
(Api_errors.Server_error(Api_errors.other_operation_in_progress, [ "VM";
Ref.string_of args.TwoPhase.vm ]))
+ else in_dom0_already_locked args) ()
+
+ (** In the synchronouse API call paths, wait in the
domU_internal_shutdown_queue and then attempt
+ to reboot the VM. NB this is the same queue used by the event thread.
*)
+ let in_dom0 args =
+ Local_work_queue.wait_in_line Local_work_queue.domU_internal_shutdown_queue
+ (Context.string_of_task args.TwoPhase.__context)
+ (fun () -> in_dom0_already_queued args)
+
let actions = { TwoPhase.in_guest = in_guest; in_dom0 = in_dom0 }
end
@@ -453,13 +568,24 @@
| `restart -> Reboot.actions
| `destroy -> Shutdown.actions
-(** Add queueing and locking policy for the external API calls *)
-let impose_external_api_policy (x: TwoPhase.t) : TwoPhase.t =
- let f args =
- Local_work_queue.wait_in_line Local_work_queue.normal_vm_queue
- (Context.string_of_task args.TwoPhase.__context)
- (fun () -> x.TwoPhase.in_dom0 args) in
- { x with TwoPhase.in_dom0 = f }
+(** If our operation conflicts with another parallel operation (i.e. we ask
for shutdown
+ but guest admin asks for reboot) then we raise an
OTHER_OPERATION_IN_PROGRESS exception
+ and retry the whole procedure. *)
+let retry_on_conflict (x: TwoPhase.args) (y: TwoPhase.t) =
+ let rec retry n =
+ try
+ y.TwoPhase.in_guest x;
+ if Xapi_fist.disable_sync_lifecycle_path ()
+ then warn "FIST: disable_sync_lifecycle_path: deferring to the event
thread"
+ else y.TwoPhase.in_dom0 x
+ with
+ | Api_errors.Server_error(code, _) as e when code =
Api_errors.other_operation_in_progress ->
+ let aborting = n < 1 in
+ debug "Conflict when executing %s: %s"
x.TwoPhase.api_call_name (if aborting then "aborting" else "retrying");
+ if aborting then raise e;
+ retry (n - 1) in
+ retry 10
+
(** CA-11132: Record information about the shutdown in odd other-config keys
for Egenera *)
let record_shutdown_details ~__context ~vm reason initiator action =
@@ -480,51 +606,36 @@
(** VM.hard_reboot entrypoint *)
let hard_reboot ~__context ~vm =
- Locking_helpers.with_lock vm
- (fun token () ->
- let action = Db.VM.get_actions_after_reboot ~__context ~self:vm in
- record_shutdown_details ~__context ~vm Xal.Rebooted "external" action;
- let args = { TwoPhase.__context=__context; vm=vm; token=Some token;
api_call_name="VM.hard_reboot"; clean=false } in
- TwoPhase.execute args (impose_external_api_policy (of_action action))
- ) ()
+ let action = Db.VM.get_actions_after_reboot ~__context ~self:vm in
+ record_shutdown_details ~__context ~vm Xal.Rebooted "external" action;
+ let args = { TwoPhase.__context=__context; vm=vm;
api_call_name="VM.hard_reboot"; clean=false } in
+ retry_on_conflict args (of_action action)
(** VM.hard_shutdown entrypoint *)
let hard_shutdown ~__context ~vm =
- Locking_helpers.with_lock vm
- (fun token () ->
- let action = Db.VM.get_actions_after_shutdown ~__context ~self:vm in
- record_shutdown_details ~__context ~vm Xal.Halted "external" action;
- let args = { TwoPhase.__context=__context; vm=vm; token=Some token;
api_call_name="VM.hard_shutdown"; clean=false } in
- TwoPhase.execute args (impose_external_api_policy (of_action action))
- ) ()
+ let action = Db.VM.get_actions_after_shutdown ~__context ~self:vm in
+ record_shutdown_details ~__context ~vm Xal.Halted "external" action;
+ let args = { TwoPhase.__context=__context; vm=vm;
api_call_name="VM.hard_shutdown"; clean=false } in
+ retry_on_conflict args (of_action action)
(** VM.clean_reboot entrypoint *)
let clean_reboot ~__context ~vm =
- Locking_helpers.with_lock vm
- (fun token () ->
- let action = Db.VM.get_actions_after_reboot ~__context ~self:vm in
- record_shutdown_details ~__context ~vm Xal.Rebooted "external" action;
- let args = { TwoPhase.__context=__context; vm=vm; token=Some token;
api_call_name="VM.clean_reboot"; clean=true } in
- TwoPhase.execute args (impose_external_api_policy (of_action action))
- ) ()
+ let action = Db.VM.get_actions_after_reboot ~__context ~self:vm in
+ record_shutdown_details ~__context ~vm Xal.Rebooted "external" action;
+ let args = { TwoPhase.__context=__context; vm=vm;
api_call_name="VM.clean_reboot"; clean=true } in
+ retry_on_conflict args (of_action action)
(** VM.clean_shutdown entrypoint *)
let clean_shutdown ~__context ~vm =
- Locking_helpers.with_lock vm
- (fun token () ->
- let action = Db.VM.get_actions_after_shutdown ~__context ~self:vm in
- record_shutdown_details ~__context ~vm Xal.Halted "external" action;
- let args = { TwoPhase.__context=__context; vm=vm; token=Some token;
api_call_name="VM.clean_shutdown"; clean=true } in
- TwoPhase.execute args (impose_external_api_policy (of_action action))
- ) ()
+ let action = Db.VM.get_actions_after_shutdown ~__context ~self:vm in
+ record_shutdown_details ~__context ~vm Xal.Halted "external" action;
+ let args = { TwoPhase.__context=__context; vm=vm;
api_call_name="VM.clean_shutdown"; clean=true } in
+ retry_on_conflict args (of_action action)
(***************************************************************************************)
-(** VM.hard_reboot_internal: called via the event thread *)
-let hard_reboot_internal ~__context ~vm =
- (* VM is locked by the caller *)
- let args = { TwoPhase.__context=__context; vm=vm; token=None;
api_call_name="VM.hard_reboot_internal"; clean=false } in
- Reboot.in_dom0 args
+(** @deprecated *)
+let hard_reboot_internal ~__context ~vm = assert false
(***************************************************************************************)
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/xapi_vm.mli
--- a/ocaml/xapi/xapi_vm.mli Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/xapi_vm.mli Thu Dec 10 23:04:50 2009 +0000
@@ -99,27 +99,26 @@
type args = {
__context : Context.t;
vm : API.ref_VM;
- token : Locking_helpers.token option;
api_call_name : string;
clean : bool;
}
type t = { in_guest : args -> unit; in_dom0 : args -> unit; }
- val execute : args -> t -> unit
end
module Reboot :
sig
val in_guest : TwoPhase.args -> unit
+ val in_dom0_already_locked : TwoPhase.args -> unit
val in_dom0 : TwoPhase.args -> unit
val actions : TwoPhase.t
end
module Shutdown :
sig
val in_guest : TwoPhase.args -> unit
+ val in_dom0_already_locked : TwoPhase.args -> unit
val in_dom0 : TwoPhase.args -> unit
val actions : TwoPhase.t
end
val of_action : [< `destroy | `restart ] -> TwoPhase.t
-val impose_external_api_policy : TwoPhase.t -> TwoPhase.t
val record_shutdown_details :
__context:Context.t ->
vm:[ `VM ] Ref.t ->
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/xapi_vm_migrate.ml
--- a/ocaml/xapi/xapi_vm_migrate.ml Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/xapi_vm_migrate.ml Thu Dec 10 23:04:50 2009 +0000
@@ -126,9 +126,18 @@
(* If we got the ack, then proceed to shutdown the domain with the suspend
reason. If we failed to get the ack, then raise an exception to abort
the migration *)
- if (ack = `ACKED) then
- Vmops.clean_shutdown_with_reason ~xal ~__context ~self domid Domain.Suspend
- else
+ if (ack = `ACKED) then begin
+ match Vmops.clean_shutdown_with_reason ~xal ~__context ~self domid
Domain.Suspend with
+ | Xal.Suspended -> () (* good *)
+ | Xal.Crashed ->
+ raise (Api_errors.Server_error(Api_errors.vm_crashed, [
Ref.string_of self ]))
+ | Xal.Rebooted ->
+ raise (Api_errors.Server_error(Api_errors.vm_rebooted, [
Ref.string_of self ]))
+ | Xal.Vanished
+ | Xal.Halted ->
+ raise (Api_errors.Server_error(Api_errors.vm_halted, [
Ref.string_of self ]))
+ | Xal.Shutdown x -> vm_migrate_failed (Printf.sprintf "Domain shutdown
for unexpected reason: %d" x)
+ end else
vm_migrate_failed "Failed to receive suspend acknowledgement within
timeout period or an abort was requested."
(* ------------------------------------------------------------------- *)
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xenops/domain.mli
--- a/ocaml/xenops/domain.mli Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xenops/domain.mli Thu Dec 10 23:04:50 2009 +0000
@@ -85,7 +85,7 @@
val shutdown: xs:Xs.xsh -> domid -> shutdown_reason -> unit
(** Tell the domain to shutdown with reason ''shutdown_reason', waiting for an
ack *)
-val shutdown_ack: ?timeout:float -> xc:Xc.handle -> xs:Xs.xsh -> domid ->
shutdown_reason -> bool
+val shutdown_wait_for_ack: ?timeout:float -> xc:Xc.handle -> xs:Xs.xsh ->
domid -> shutdown_reason -> unit
(** send a domain a sysrq *)
val sysrq: xs:Xs.xsh -> domid -> char -> unit
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xenops/xenops.ml
--- a/ocaml/xenops/xenops.ml Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xenops/xenops.ml Thu Dec 10 23:04:50 2009 +0000
@@ -76,7 +76,12 @@
printf "built hvm domain: %u\n" domid
let clean_shutdown_domain ~xal ~domid ~reason ~sync =
- let acked = Domain.shutdown_ack (Xal.xc_of_ctx xal) (Xal.xs_of_ctx xal)
domid reason in
+ let xc = Xal.xc_of_ctx xal in
+ let xs = Xal.xs_of_ctx xal in
+ Domain.shutdown ~xs domid reason;
+ (* Wait for any necessary acknowledgement. If we get a Watch.Timeout _ then
+ we abort early; otherwise we continue in Xal.wait_release below. *)
+ let acked = try Domain.shutdown_wait_for_ack ~xc ~xs domid reason; true with
Watch.Timeout _ -> false in
if not acked then (
eprintf "domain %u didn't acknowledged shutdown\n" domid;
) else (
13 files changed, 497 insertions(+), 139 deletions(-)
ocaml/idl/api_errors.ml | 3
ocaml/idl/datamodel.ml | 6
ocaml/xapi/OMakefile | 2
ocaml/xapi/events.ml | 20 +-
ocaml/xapi/quicktest.ml | 1
ocaml/xapi/quicktest_lifecycle.ml | 194 ++++++++++++++++++++++++++
ocaml/xapi/vmops.ml | 91 ++++++------
ocaml/xapi/xapi_fist.ml | 17 ++
ocaml/xapi/xapi_vm.ml | 273 ++++++++++++++++++++++++++-----------
ocaml/xapi/xapi_vm.mli | 5
ocaml/xapi/xapi_vm_migrate.ml | 15 +-
ocaml/xenops/domain.mli | 2
ocaml/xenops/xenops.ml | 7
xen-api.hg-4.patch
Description: Text Data
_______________________________________________
xen-api mailing list
xen-api@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/mailman/listinfo/xen-api
|