WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-api

[Xen-API] [PATCH 4 of 4] CA-33707: fix queueing deadlocking bug by never

To: xen-api@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-API] [PATCH 4 of 4] CA-33707: fix queueing deadlocking bug by never entering a queue with a lock held
From: David Scott <dave.scott@xxxxxxxxxxxxx>
Date: Thu, 10 Dec 2009 23:04:58 +0000
Delivery-date: Thu, 10 Dec 2009 15:08:31 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
In-reply-to: <patchbomb.1260486294@xxxxxxxxxxxxxxxxxxxx>
List-help: <mailto:xen-api-request@lists.xensource.com?subject=help>
List-id: Discussion of API issues surrounding Xen <xen-api.lists.xensource.com>
List-post: <mailto:xen-api@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-api>, <mailto:xen-api-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-api>, <mailto:xen-api-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-api-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User David Scott <dave.scott@xxxxxxxxxxxxx>
# Date 1260486290 0
# Node ID 9edc8c86f01dd5e951b944b5424e5b9383d0780c
# Parent  ca92f46da128588874c6c660aef6409adae119dd
CA-33707: fix queueing deadlocking bug by never entering a queue with a lock 
held.

We now request domain shutdown without the per-VM mutex held. This means these 
may race with the background event thread performing a domain destruction.

The domain destruction/recreation part of {clean,hard}_{shutdown,reboot} is 
placed in the domU_shutdown_queue by both the synchronous API path and the 
event thread.

We remove the Vmops.Domain_shutdown_for_wrong_reason exception and replace it 
with VM_{CRASHED,REBOOTED,HALTED} exceptions. We may yet be able to remove 
these "errors" completely.

Work around many instances of VM_FAILED_SHUTDOWN_ACK by reissuing the shutdown 
request every few seconds until the timeout expires.

If VM.{clean,hard}_reboot runs in parallel with an internal domain reboot then 
only one reboot will probably happen.
If VM.{clean,hard}_shutdown runs in parallel with an internal domain reboot 
then up to 10 retries to shut the VM down will be attempted.

Add FIST points to:
1. disable the event thread's handling of @releaseDomain
2. disable the synchronous API calls handling of domain destruction/recreation
3. disable the artificial VM reboot delay
4. simulate an internal shutdown (via Xc.domain_shutdown)

Add a series of tests to quicktest which run every combination of
* VM.{clean,hard}_{shutdown,reboot}
* parallel internal halt,reboot,crash
* synchronous thread only, event thread only, both

Signed-off-by: David Scott <dave.scott@xxxxxxxxxxxxx>

diff -r ca92f46da128 -r 9edc8c86f01d ocaml/idl/api_errors.ml
--- a/ocaml/idl/api_errors.ml   Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/idl/api_errors.ml   Thu Dec 10 23:04:50 2009 +0000
@@ -136,6 +136,9 @@
 let vm_shutdown_timeout = "VM_SHUTDOWN_TIMEOUT"
 let vm_duplicate_vbd_device = "VM_DUPLICATE_VBD_DEVICE"
 let vm_not_resident_here = "VM_NOT_RESIDENT_HERE"
+let vm_crashed = "VM_CRASHED"
+let vm_rebooted = "VM_REBOOTED"
+let vm_halted = "VM_HALTED"
 let vms_failed_to_cooperate = "VMS_FAILED_TO_COOPERATE"
 let domain_exists = "DOMAIN_EXISTS"
 let cannot_reset_control_domain = "CANNOT_RESET_CONTROL_DOMAIN"
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/idl/datamodel.ml
--- a/ocaml/idl/datamodel.ml    Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/idl/datamodel.ml    Thu Dec 10 23:04:50 2009 +0000
@@ -618,6 +618,12 @@
     ~doc:"VM didn't acknowledge the need to shutdown." ();
   error Api_errors.vm_shutdown_timeout [ "vm"; "timeout" ]
     ~doc:"VM failed to shutdown before the timeout expired" ();
+  error Api_errors.vm_crashed [ "vm" ]
+         ~doc:"The VM crashed" ();
+  error Api_errors.vm_rebooted [ "vm" ]
+         ~doc:"The VM unexpectedly rebooted" ();
+  error Api_errors.vm_halted [ "vm" ]
+         ~doc:"The VM unexpectedly halted" ();
   error Api_errors.bootloader_failed [ "vm"; "msg" ]
     ~doc:"The bootloader returned an error" ();
   error Api_errors.unknown_bootloader [ "vm"; "bootloader" ]
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/OMakefile
--- a/ocaml/xapi/OMakefile      Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/OMakefile      Thu Dec 10 23:04:50 2009 +0000
@@ -48,7 +48,7 @@
 
 #OCamlProgram(upload_receive, $(COMMON) fileupload upload_receive)
 
-OCamlProgram(quicktestbin, quicktest quicktest_common quicktest_ocamltest 
quicktest_storage quicktest_http quicktest_encodings quicktest_vm_placement 
vm_placement ../xenops/squeeze_test quicktest_vm_memory_constraints 
../util/vm_memory_constraints)
+OCamlProgram(quicktestbin, quicktest quicktest_common quicktest_ocamltest 
quicktest_storage quicktest_http quicktest_encodings quicktest_vm_placement 
vm_placement ../xenops/squeeze_test quicktest_vm_memory_constraints 
../util/vm_memory_constraints quicktest_lifecycle)
 OCamlProgram(stresstest, stresstest)
 OCamlProgram(fakeguestagent, fakeguestagent)
 
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/events.ml
--- a/ocaml/xapi/events.ml      Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/events.ml      Thu Dec 10 23:04:50 2009 +0000
@@ -121,15 +121,19 @@
        (try Db.VM.remove_from_other_config ~__context ~self:vm 
~key:Xapi_globs.last_artificial_reboot_delay_key with _ -> ());
        Db.VM.add_to_other_config ~__context ~self:vm 
~key:Xapi_globs.last_artificial_reboot_delay_key ~value:(string_of_int 2);
        0 in
-    debug "Adding artificial delay on reboot for VM: %s. delay time=%d 
seconds" (Ref.string_of vm) delay;
-    Thread.delay (float_of_int delay)
-      
+       if Xapi_fist.disable_reboot_delay ()
+       then debug "FIST: disable_reboot_delay"
+    else begin
+         debug "Adding artificial delay on reboot for VM: %s. delay time=%d 
seconds" (Ref.string_of vm) delay;
+      Thread.delay (float_of_int delay)
+       end
+
   let clear_reboot_delay ~__context ~vm =
     try Db.VM.remove_from_other_config ~__context ~self:vm 
~key:Xapi_globs.last_artificial_reboot_delay_key with _ -> ()
       
   let perform_destroy ~__context ~vm token =
     TaskHelper.set_description ~__context "destroy";
-    Xapi_vm.Shutdown.in_dom0 { Xapi_vm.TwoPhase.__context = __context; vm=vm; 
token=Some token; api_call_name="destroy"; clean=false };
+    Xapi_vm.Shutdown.in_dom0_already_locked { Xapi_vm.TwoPhase.__context = 
__context; vm=vm; api_call_name="destroy"; clean=false };
     update_allowed_ops_using_api ~__context vm
 
   let perform_preserve ~__context ~vm token = 
@@ -523,8 +527,12 @@
                let action_taken = Resync.vm ~__context token vm in
                if action_taken then debug "Action was taken so 
allowed_operations should be updated";             
              in
-             debug "adding Resync.vm to work queue";
-             push vm Local_work_queue.domU_internal_shutdown_queue description 
work_item;
+                 if Xapi_fist.disable_event_lifecycle_path ()
+                 then warn "FIST: disable_event_lifecycle_path: skipping 
Resync.vm"
+                 else begin
+                       debug "adding Resync.vm to work queue";
+                       push vm Local_work_queue.domU_internal_shutdown_queue 
description work_item;
+                 end
           )
        with Vm_corresponding_to_domid_not_in_db domid ->
         error "event could not be processed because VM record not in database"
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/quicktest.ml
--- a/ocaml/xapi/quicktest.ml   Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/quicktest.ml   Thu Dec 10 23:04:50 2009 +0000
@@ -619,6 +619,7 @@
     end;
     vbd_pause_unpause_test s debian;
     powercycle_test s debian;
+       Quicktest_lifecycle.test s debian;
     vm_uninstall test s debian;  
     success test
   with Unable_to_find_suitable_debian_template ->
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/quicktest_lifecycle.ml
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/ocaml/xapi/quicktest_lifecycle.ml Thu Dec 10 23:04:50 2009 +0000
@@ -0,0 +1,194 @@
+
+
+type 'a api_call = 
+  | Shutdown of 'a
+  | Reboot of 'a
+
+type api_mode = 
+  | Clean
+  | Hard
+
+type api = api_mode api_call
+
+type parallel_op = 
+  | Internal_reboot
+  | Internal_halt
+  | Internal_suspend
+  | Internal_crash
+
+type code_path = 
+  | Sync
+  | Event
+  | Both
+
+type result = 
+  | Rebooted
+  | Halted
+
+let final_guest_state = function
+  | Shutdown _ -> Halted
+  | Reboot _ -> Rebooted
+
+type test = { 
+       api: api option;
+       parallel_op: parallel_op option;
+       code_path: code_path;
+}
+
+let string_of_result = function
+  | Rebooted -> "Reboot"
+  | Halted -> "Halt"
+
+let expected_result = function
+  | { api = Some (Shutdown _); parallel_op = Some _; code_path = (Sync|Both) } 
-> Some Halted
+  | { api = Some (Reboot _);   parallel_op = Some _; code_path = (Sync|Both) } 
-> Some Rebooted
+  | { api = Some (Shutdown _); parallel_op = None;   code_path = 
(Sync|Event|Both) } -> Some Halted
+  | { api = Some (Reboot _);   parallel_op = None;   code_path = 
(Sync|Event|Both) } -> Some Rebooted
+  | { parallel_op = Some (Internal_halt | Internal_crash); code_path = Event } 
-> Some Halted
+  | { parallel_op = Some Internal_reboot; code_path = Event } -> Some Rebooted
+  
+  | { api = None; parallel_op = Some (Internal_halt (* | Internal_suspend *) | 
Internal_crash); code_path = (Event|Both) } -> Some Halted
+  | { api = None; parallel_op = Some Internal_reboot; code_path = (Event|Both) 
 } -> Some Rebooted
+  | _ -> None (* invalid test *)
+
+
+let string_of_test x = 
+  let string_of_api = function
+       | Shutdown Clean   -> "clean_shutdown"
+       | Shutdown Hard    -> "hard_shutdown "
+       | Reboot Clean     -> "clean_reboot  "
+       | Reboot Hard      -> "hard_reboot   " in
+  let string_of_parallel_op = function
+       | Internal_reboot  -> "reboot        "
+       | Internal_halt    -> "halt          "
+       | Internal_suspend -> "suspend       "
+       | Internal_crash   -> "crash         " in
+  let string_of_code_path = function
+       | Sync             -> "synch         "
+       | Event            -> "event         " 
+       | Both             -> "both          " in
+  let dm f x = match x with 
+       | None             -> "Nothing       " 
+       | Some x           -> f x in
+  Printf.sprintf "%s %s %s -> %s" 
+         (dm string_of_api x.api) (dm string_of_parallel_op x.parallel_op) 
(string_of_code_path x.code_path)
+         (match expected_result x with None -> "invalid" | Some y -> 
string_of_result y)
+open List
+
+let all_possible_tests =
+  let all_api_variants x = 
+       [ { x with api = None };
+         { x with api = Some (Shutdown Clean) };
+         { x with api = Some (Shutdown Hard) };
+         { x with api = Some (Reboot Clean) };
+         { x with api = Some (Reboot Hard) } ] in
+  let all_parallel_op_variants x = 
+       [ { x with parallel_op = None };
+         { x with parallel_op = Some Internal_reboot };
+         { x with parallel_op = Some Internal_halt };
+         { x with parallel_op = Some Internal_suspend };
+         { x with parallel_op = Some Internal_crash } ] in
+  let all_code_path_variants x = 
+       [ { x with code_path = Sync };
+         { x with code_path = Event };
+         { x with code_path = Both } ] in
+
+  let xs = [ { api = None; parallel_op = None; code_path = Sync } ] in
+  concat (map all_code_path_variants (concat (map all_parallel_op_variants 
(concat (map all_api_variants xs)))))
+                       
+let all_valid_tests = List.filter (fun t -> expected_result t <> None) 
all_possible_tests
+
+         (*
+let _ = 
+  List.iter print_endline (map string_of_test all_valid_tests);
+  Printf.printf "In total there are %d tests.\n" (List.length all_valid_tests)
+         *)
+
+open Quicktest_common
+open Client
+open Pervasiveext
+
+let one s debian test = 
+  let t = make_test (string_of_test test) 1 in
+  start t;
+  let event = "/tmp/fist_disable_event_lifecycle_path" in
+  let sync = "/tmp/fist_disable_sync_lifecycle_path" in
+  let simulate = "/tmp/fist_simulate_internal_shutdown" in
+  let delay = "/tmp/fist_disable_reboot_delay" in
+
+  finally
+         (fun () ->
+                  try
+                        begin 
+                          Unixext.unlink_safe simulate;
+                          Unixext.touch_file delay;
+                          match test.code_path with
+                          | Sync ->
+                                        Unixext.unlink_safe sync;
+                                        Unixext.touch_file event
+                          | Event ->
+                                        Unixext.unlink_safe event;
+                                        Unixext.touch_file sync
+                          | Both ->
+                                        Unixext.unlink_safe sync;
+                                        Unixext.unlink_safe event
+                        end;
+                          if Client.VM.get_power_state !rpc s debian = `Halted
+                          then Client.VM.start !rpc s debian false false;
+                          
+                          let call_api = function
+                                | Shutdown Clean -> Client.VM.clean_shutdown 
!rpc s debian
+                                | Shutdown Hard -> Client.VM.hard_shutdown 
!rpc s debian
+                                | Reboot Clean -> Client.VM.clean_reboot !rpc 
s debian
+                                | Reboot Hard -> Client.VM.hard_reboot !rpc s 
debian in
+                          
+                          let domid = Client.VM.get_domid !rpc s debian in
+                          begin match test with
+                          | { api = None; parallel_op = Some x } ->
+                                        let reason = match x with
+                                          | Internal_reboot -> Xc.Reboot
+                                          | Internal_halt -> Xc.Halt
+                                          | Internal_crash -> Xc.Crash
+                                          | Internal_suspend -> Xc.Suspend in
+                                        Xc.with_intf (fun xc -> 
Xc.domain_shutdown xc (Int64.to_int domid) reason)
+                          | { api = Some x; parallel_op = Some y } ->
+                                        let reason = match y with
+                                          | Internal_reboot -> "reboot"
+                                          | Internal_halt -> "halt"
+                                          | Internal_crash -> "crash"
+                                          | Internal_suspend -> "suspend" in
+                                        Unixext.write_string_to_file simulate 
reason;
+                                        call_api x
+                          | { api = Some x; parallel_op = None } ->
+                                        call_api x
+                          | t -> failwith (Printf.sprintf "Invalid test: %s" 
(string_of_test t))
+                          end;
+                          
+                          let wait_for_domid p =
+                                let start = Unix.gettimeofday () in
+                                let finished = ref false in
+                                while Unix.gettimeofday () -. start < 300. && 
(not !finished) do
+                                  finished := p (Client.VM.get_domid !rpc s 
debian);
+                                        if not !finished then Thread.delay 1.
+                                done;
+                                if not !finished then failwith "timeout"
+                          in
+                          
+                          begin match expected_result test with
+                          | None -> failwith (Printf.sprintf "Invalid test: 
%s" (string_of_test test))
+                          | Some Rebooted ->
+                                        wait_for_domid (fun domid' -> domid <> 
domid')
+                          | Some Halted ->
+                                        wait_for_domid (fun domid' -> domid' = 
-1L)
+                          end
+                  with e -> failed t (Printexc.to_string e)
+         )
+         (fun () ->
+                  Unixext.unlink_safe sync;
+                  Unixext.unlink_safe event;
+                  Unixext.unlink_safe delay
+         );
+  success t
+
+let test s debian = 
+  List.iter (one s debian) all_valid_tests
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/vmops.ml
--- a/ocaml/xapi/vmops.ml       Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/vmops.ml       Thu Dec 10 23:04:50 2009 +0000
@@ -777,59 +777,52 @@
     )
 
 
-
-let match_xal_and_shutdown xalreason reason =
-       debug "Comparing XAL %s with Domain %s"
-             (Xal.string_of_died_reason xalreason)
-             (Domain.string_of_shutdown_reason reason);
-       match xalreason, reason with
-       | Xal.Crashed, _ -> false
-       | Xal.Vanished, _ -> false
-       | Xal.Halted, (Domain.Halt | Domain.PowerOff) -> true
-       | Xal.Rebooted, Domain.Reboot -> true
-       | Xal.Suspended, Domain.Suspend -> true
-       | Xal.Shutdown i, Domain.Unknown i2 -> i = i2
-       | _, _ -> false
-
 (** Thrown if clean_shutdown_with_reason exits for the wrong reason: eg the 
domain
     crashed or rebooted *)
 exception Domain_shutdown_for_wrong_reason of Xal.died_reason
 
-(** Tells a VM to shutdown with a specific reason (reboot/halt/poweroff). *)
+(** Tells a VM to shutdown with a specific reason (reboot/halt/poweroff), 
waits for
+    it to shutdown (or vanish) and then return the reason.
+       Note this is not always called with the per-VM mutex. *)
 let clean_shutdown_with_reason ?(at = fun _ -> ()) ~xal ~__context ~self domid 
reason =
   (* Set the task allowed_operations to include cancel *)
   if reason <> Domain.Suspend then TaskHelper.set_cancellable ~__context;
 
   at 0.25;
-  (* Windows PV drivers will respond within 10s according to ssmith and
-     improving this is likely to happen in a Rio timeframe (CA-3964). It's
-     still possible (although unlikely) for us to timeout just before the
-     drivers activate but the worst we'll suffer is a shutdown failure
-     followed by a spontaneous shutdown (which can happen anyway). Having
-     this check in here allows us to bail out quickly in the common case
-     of the PV drivers being missing. *)
-  with_xs (fun xs ->
-            let xc = Xal.xc_of_ctx xal in
-            if not (Domain.shutdown_ack ~timeout:60. ~xc ~xs domid reason) then
-              raise (Api_errors.Server_error 
(Api_errors.vm_failed_shutdown_ack, []))
-         );
+  let xs = Xal.xs_of_ctx xal in
+  let xc = Xal.xc_of_ctx xal in
+  begin
+       (* Wait for up to 60s for the VM to acknowledge the shutdown request. 
In case the guest
+          misses our original request, keep making additional ones. *)
+       let finished = ref false in
+       let timeout = 60.0 in
+       let start = Unix.gettimeofday () in
+       while Unix.gettimeofday () -. start < timeout && not !finished do
+         try
+               (* Make the shutdown request: this will fail if the domain has 
vanished. *)
+               Domain.shutdown ~xs domid reason;
+               (* Wait for any necessary acknowledgement. If we get a 
Watch.Timeout _ then
+                  we abort early; otherwise we continue in Xal.wait_release 
below. *)
+               Domain.shutdown_wait_for_ack ~timeout:10. ~xc ~xs domid reason;
+               finished := true
+         with 
+         | Watch.Timeout _ -> () (* try again *)
+         | e ->
+                       debug "Caught and ignoring exception: %s" 
(ExnHelper.string_of_exn e);
+                       log_backtrace ();
+                       finished := true
+       done;
+       if not !finished then raise (Api_errors.Server_error 
(Api_errors.vm_failed_shutdown_ack, []))
+  end;
   at 0.50;
   let total_timeout = 20. *. 60. in (* 20 minutes *)
   (* Block for 5s at a time, in between check to see whether we've been 
cancelled
      and update our progress if not *)
   let start = Unix.gettimeofday () in
-  let finished = ref false in
-  while (Unix.gettimeofday () -. start < total_timeout) && not(!finished) do
+  let result = ref None in
+  while (Unix.gettimeofday () -. start < total_timeout) && (!result = None) do
     try
-      let r = Xal.wait_release xal ~timeout:5. domid in
-      if not (match_xal_and_shutdown r reason) then begin
-       let errmsg = Printf.sprintf 
-         "Domain died with reason: %s when it should have been %s" 
-         (Xal.string_of_died_reason r) (Domain.string_of_shutdown_reason 
reason) in
-       debug "%s" errmsg;
-       raise (Domain_shutdown_for_wrong_reason r)
-      end;
-      finished := true;
+      result := Some (Xal.wait_release xal ~timeout:5. domid);
     with Xal.Timeout -> 
       if reason <> Domain.Suspend && TaskHelper.is_cancelling ~__context
       then raise (Api_errors.Server_error(Api_errors.task_cancelled, [ 
Ref.string_of (Context.get_task_id __context) ]));
@@ -837,9 +830,11 @@
       let progress = min ((Unix.gettimeofday () -. start) /. total_timeout) 1. 
in
       at (0.50 +. 0.25 *. progress)
   done;
-  if not(!finished)
-  then raise (Api_errors.Server_error(Api_errors.vm_shutdown_timeout, [ 
Ref.string_of self; string_of_float total_timeout ]));
-  at 1.0
+  match !result with
+  | None -> raise (Api_errors.Server_error(Api_errors.vm_shutdown_timeout, [ 
Ref.string_of self; string_of_float total_timeout ]))
+  | Some x ->
+               at 1.0;
+               x
 
 (* !!! FIX ME  - This allows a 10% overhead on static_max for size of suspend 
image !!! *)
 let get_suspend_space __context vm =
@@ -877,9 +872,19 @@
                                                                        
Domain.suspend ~xc ~xs ~hvm domid fd []
                                                                                
~progress_callback:progress_cb
                                                                                
(fun () ->
-                                                                               
        clean_shutdown_with_reason ~xal
+                                                                               
        match clean_shutdown_with_reason ~xal
                                                                                
                ~__context ~self:vm domid
-                                                                               
                Domain.Suspend
+                                                                               
                Domain.Suspend with
+                                                                               
                | Xal.Suspended -> () (* good *)
+                                                                               
                | Xal.Crashed ->
+                                                                               
                          raise (Api_errors.Server_error(Api_errors.vm_crashed, 
[ Ref.string_of vm ]))
+                                                                               
                | Xal.Rebooted ->
+                                                                               
                          raise 
(Api_errors.Server_error(Api_errors.vm_rebooted, [ Ref.string_of vm ]))
+                                                                               
                | Xal.Halted
+                                                                               
                | Xal.Vanished ->
+                                                                               
                          raise (Api_errors.Server_error(Api_errors.vm_halted, 
[ Ref.string_of vm ]))
+                                                                               
                | Xal.Shutdown x ->
+                                                                               
                          failwith (Printf.sprintf "Expected domain shutdown 
reason: %d" x)
                                                                                
)
                                                                );
                                                        (* If the suspend 
succeeds, set the suspend_VDI *)
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/xapi_fist.ml
--- a/ocaml/xapi/xapi_fist.ml   Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/xapi_fist.ml   Thu Dec 10 23:04:50 2009 +0000
@@ -26,6 +26,8 @@
        try
                Some (Unixext.read_whole_file_to_string ("/tmp/fist_" ^ name))
        with _ -> None
+
+let delete name = Unixext.unlink_safe ("/tmp/fist_" ^ name)
 
 (** Insert 2 * Xapi_globs.max_clock_skew into the heartbeat messages *)
 let insert_clock_skew             () = fistpoint "insert_clock_skew"
@@ -94,3 +96,18 @@
 (** Set the expiry date of a v6-license to the one in the file *)
 let set_expiry_date () = fistpoint_read "set_expiry_date"
 
+(** Forces synchronous lifecycle path to defer to the event thread *)
+let disable_sync_lifecycle_path () = fistpoint "disable_sync_lifecycle_path"
+
+(** Forces synchronous lifecycle path by partially disabling the event thread 
*)
+let disable_event_lifecycle_path () = fistpoint "disable_event_lifecycle_path"
+
+(** If set to "reboot" "halt" "suspend" "crash" this will forcibly shutdown 
the domain during reboot/shutdown *)
+let simulate_internal_shutdown () = 
+  let fist = "simulate_internal_shutdown" in
+  let x = fistpoint_read fist in
+  delete fist;
+  x
+
+(** Disables the artificial reboot delay, for faster testing. *)
+let disable_reboot_delay () = fistpoint "disable_reboot_delay"
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/xapi_vm.ml
--- a/ocaml/xapi/xapi_vm.ml     Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/xapi_vm.ml     Thu Dec 10 23:04:50 2009 +0000
@@ -298,7 +298,6 @@
   (** The signature of a single phase of reboot or shutdown *)
   type args = { __context: Context.t;
                vm: API.ref_VM;
-               token: Locking_helpers.token option;
                api_call_name: string;
                clean: bool }
 
@@ -307,31 +306,85 @@
     in_guest : args -> unit;
     in_dom0 : args -> unit;
   }
-  let execute (x: args) (y: t) = 
-    y.in_guest x;
-    y.in_dom0 x
+
+  (** Called with the per-VM lock held. Evaluates to true if the VM has been 
rebooted (eg by the event thread) *)
+  let is_vm_running x =
+       (* The VM may have been rebooted by the event thread: in this case 
there is no work to do *)
+       let domid = Helpers.domid_of_vm x.__context x.vm in
+       true
+       && domid <> -1 (* someone set the state to Halted *)
+       && (with_xc
+                       (fun xc ->
+                                let di = Xc.domain_getinfo xc domid in
+                                Xal.is_running di))
+
+  (** Called before a regular synchronous reboot/shutdown to simulate parallel 
in-guest shutdowns *)
+  let simulate_internal_shutdown domid = 
+       Helpers.log_exn_continue (Printf.sprintf "simulate_internal_shutdown 
domid=%d" domid)
+               (fun () ->
+                        match Xapi_fist.simulate_internal_shutdown () with
+                        | Some x ->
+                                  let x = String.strip String.isspace x in
+                                  with_xc
+                                          (fun xc ->
+                                                       warn "FIST: simulating 
internal %s for domid=%d" x domid;
+                                                       match x with
+                                                       | "reboot" -> 
Xc.domain_shutdown xc domid Xc.Reboot
+                                                       | "halt" -> 
Xc.domain_shutdown xc domid Xc.Halt
+                                                       | "suspend" -> 
Xc.domain_shutdown xc domid Xc.Suspend
+                                                       | "crash" -> 
Xc.domain_shutdown xc domid Xc.Crash
+                                                       | _ -> failwith 
"Unknown simulate_internal_shutdown code");
+                                  (* pause for 5s which probably lets the 
event thread do something (unless it is disabled) *)
+                                  Thread.delay 5.
+                        | None -> ()
+               ) ()
 end
+
 
 
 module Reboot = struct
   (** This module contains the low-level implementation actions, as distinct 
from the tangle
       of policy which comes later. *)
 
-  let in_guest { TwoPhase.__context = __context; vm=vm; token=token; 
api_call_name=api_call_name; clean=clean } =
-    if clean then begin
-      debug "%s phase 0/3: shutting down existing domain" api_call_name;
-      let domid = Helpers.domid_of_vm ~__context ~self:vm in
-      with_xal (fun xal -> Vmops.clean_shutdown_with_reason ~xal
-                  ~at:(fun x -> TaskHelper.set_progress ~__context (x /. 2.))
-                  ~__context ~self:vm domid Domain.Reboot);
-    end else debug "%s phase 0/3: no shutdown request required since this is a 
hard_reboot" api_call_name
-      
-  let in_dom0 { TwoPhase.__context = __context; vm=vm; token=token; 
api_call_name=api_call_name; clean=clean } =
+  (** Run without the per-VM lock to request the guest shuts itself down (if 
clean) *)
+  let in_guest { TwoPhase.__context = __context; vm=vm; 
api_call_name=api_call_name; clean=clean } =
+    let domid = Helpers.domid_of_vm ~__context ~self:vm in
+       TwoPhase.simulate_internal_shutdown domid;
+
+       (* NB a parallel internal halt may leave the domid as -1. If so then 
there's no work for us 
+          to do here. *)
+       if domid <> -1 then begin
+      if clean then begin
+               debug "%s phase 0/3: shutting down existing domain (domid: %d)" 
api_call_name domid;
+               match with_xal (fun xal -> Vmops.clean_shutdown_with_reason ~xal
+                                                       ~at:(fun x -> 
TaskHelper.set_progress ~__context (x /. 2.))
+                                                       ~__context ~self:vm 
domid Domain.Reboot) with
+               | Xal.Vanished
+               | Xal.Rebooted -> () (* good *)
+               | Xal.Suspended ->
+                         error "VM: %s suspended when asked to reboot" 
(Ref.string_of vm)
+               | Xal.Crashed ->
+                         error "VM: %s crashed when asked to reboot" 
(Ref.string_of vm)
+               | Xal.Halted ->
+                         error "VM: %s halted when asked to reboot" 
(Ref.string_of vm)
+      end else begin
+               debug "%s phase 0/3: no shutdown request required since this is 
a hard_reboot" api_call_name;
+               (* The domain might be killed by the event thread. Again, this 
is ok. *)
+               Helpers.log_exn_continue (Printf.sprintf "Xc.domain_shutdown 
domid=%d Xc.Reboot" domid)
+                       (fun () -> 
+                                with_xc (fun xc -> Xc.domain_shutdown xc domid 
Xc.Reboot)
+                       ) ()
+         end
+       end
+
+  (** Once the domain has shutdown and the VM is locked, perform the reboot 
immediately *)
+  let in_dom0_already_locked { TwoPhase.__context = __context; vm=vm; 
api_call_name=api_call_name; clean=clean } =
     License_check.vm ~__context vm;
     Stats.time_this "VM reboot (excluding clean shutdown phase)"
       (fun () ->
          let new_snapshot = Db.VM.get_record ~__context ~self:vm in
-        let current_snapshot = Helpers.get_boot_record ~__context ~self:vm in
+
+                let current_snapshot = Helpers.get_boot_record ~__context 
~self:vm in
         (* Master will have already checked the new memory_max and placed the 
max of
            the current and new values in the current_snapshot.
            Just in case someone raced with us and bumped the static_max 
*again* we
@@ -343,8 +396,7 @@
         let new_snapshot = { new_snapshot with API.vM_memory_static_max = 
new_mem } in
         
         let localhost = Helpers.get_localhost ~__context in
-        
-         let domid = Helpers.domid_of_vm ~__context ~self:vm in
+        let domid = Helpers.domid_of_vm ~__context ~self:vm in
          debug "%s phase 1/3: destroying old domain" api_call_name;
         (* CA-13585: prevent glitch where power-state goes to Halted in the 
middle of a reboot.
            If an error causes us to leave this function then the event thread 
should resynchronise
@@ -366,7 +418,6 @@
         Helpers.set_boot_record ~__context ~self:vm new_snapshot;
         
          debug "%s phase 2/3: starting new domain" api_call_name;
-        Opt.iter (Locking_helpers.assert_locked vm) token;
         begin
           try
              Vmops.start_paused
@@ -391,12 +442,28 @@
                          );
           Db.VM.set_resident_on ~__context ~self:vm ~value:localhost;
            Db.VM.set_power_state ~__context ~self:vm ~value:`Running;
-          Opt.iter (Locking_helpers.assert_locked vm) token;
         with exn ->
           error "Caught exception during %s: %s" api_call_name 
(ExnHelper.string_of_exn exn);
           with_xc_and_xs (fun xc xs -> Vmops.destroy ~__context ~xc ~xs 
~self:vm domid `Halted);
           raise exn     
       )
+
+  (** In the synchronous API call paths, acquire the VM lock and see if the VM 
hasn't rebooted yet.
+         If necessary we reboot it here. *)
+  let in_dom0_already_queued args = 
+       Locking_helpers.with_lock args.TwoPhase.vm 
+               (fun _ _ -> 
+                        if TwoPhase.is_vm_running args
+                        then debug "VM %s has already rebooted: taking no 
action" (Ref.string_of args.TwoPhase.vm)
+                        else in_dom0_already_locked args) ()
+
+  (** In the synchronouse API call paths, wait in the 
domU_internal_shutdown_queue and then attempt 
+         to reboot the VM. NB this is the same queue used by the event thread. 
*)
+  let in_dom0 args =
+    Local_work_queue.wait_in_line Local_work_queue.domU_internal_shutdown_queue
+      (Context.string_of_task args.TwoPhase.__context)
+      (fun () -> in_dom0_already_queued args)
+
   let actions = { TwoPhase.in_guest = in_guest; in_dom0 = in_dom0 }
 end
 
@@ -404,32 +471,64 @@
   (** This module contains the low-level implementation actions, as distinct 
from the tangle
       of policy which comes later. *)
 
-  let in_guest { TwoPhase.__context=__context; vm=vm; token=token; 
api_call_name=api_call_name; clean=clean } =
-    Opt.iter (Locking_helpers.assert_locked vm) token;
+  (** Run without the per-VM lock to request the guest shuts itself down (if 
clean) *)
+  let in_guest { TwoPhase.__context=__context; vm=vm; 
api_call_name=api_call_name; clean=clean } =
     assert_ha_always_run_is_false ~__context ~vm;
+    let domid = Helpers.domid_of_vm ~__context ~self:vm in
+       TwoPhase.simulate_internal_shutdown domid;
 
-    if clean then begin
-      debug "%s: phase 1/2: waiting for the domain to shutdown" api_call_name;
-      let domid = Helpers.domid_of_vm ~__context ~self:vm in
-      
-      with_xal (fun xal -> Vmops.clean_shutdown_with_reason ~xal
-                 ~at:(TaskHelper.set_progress ~__context)
-                 ~__context ~self:vm domid Domain.Halt);
-    end else debug "%s phase 0/3: no shutdown request required since this is a 
hard_shutdown" api_call_name
+       (* NB a parallel internal halt may leave the domid as -1. If so then 
there's no work for us 
+          to do here. *)
+       if domid <> -1 then begin
+      if clean then begin
+               debug "%s: phase 1/2: waiting for the domain to shutdown" 
api_call_name;
+               
+               match with_xal (fun xal -> Vmops.clean_shutdown_with_reason ~xal
+                                                       
~at:(TaskHelper.set_progress ~__context)
+                                                       ~__context ~self:vm 
domid Domain.Halt) with
+               | Xal.Vanished
+               | Xal.Halted -> () (* good *)
+               | Xal.Suspended ->
+                         (* Log the failure but continue *)
+                         error "VM: %s suspended when asked to shutdown" 
(Ref.string_of vm)
+               | Xal.Crashed ->
+                         (* Log the failure but continue *)
+                       error "VM: %s crashed when asked to shutdown" 
(Ref.string_of vm)
+               | Xal.Rebooted ->
+                         (* Log the failure but continue *)
+                         error "VM: %s attempted to reboot when asked to 
shutdown" (Ref.string_of vm)
+      end else begin
+               debug "%s phase 0/3: no shutdown request required since this is 
a hard_shutdown" api_call_name;
+               (* The domain might be killed by the event thread. Again, this 
is ok. *)
+               Helpers.log_exn_continue (Printf.sprintf "Xc.domain_shutdown 
domid=%d Xc.Halt" domid)
+                       (fun () -> 
+                                with_xc (fun xc -> Xc.domain_shutdown xc domid 
Xc.Halt)
+                       ) ()
+         end
+       end
 
-  let in_dom0 { TwoPhase.__context=__context; vm=vm; token=token; 
api_call_name=api_call_name; clean=clean } =
-    (* Invoke pre_destroy hook *)
-    Xapi_hooks.vm_pre_destroy ~__context ~reason:(if clean then 
Xapi_hooks.reason__clean_shutdown else Xapi_hooks.reason__hard_shutdown) ~vm;
-
+  (** Run with the per-VM lock held to clean up any shutdown domain. Note if 
the VM has been rebooted
+         then we abort with OTHER_OPERATION_IN_PROGRESS. See 
[retry_on_conflict] *)
+  let in_dom0_already_locked { TwoPhase.__context=__context; vm=vm; 
api_call_name=api_call_name; clean=clean } =
+       (* If the VM has been shutdown by the event thread (domid = -1) then 
there's no domain to destroy. *)
+       (* If the VM is running again then throw an error to trigger 
retry_on_conflict *)
     let domid = Helpers.domid_of_vm ~__context ~self:vm in
-    if domid <> -1 then begin
-      debug "%s: phase 2/2: destroying old domain (domid %d)" api_call_name 
domid;
-      with_xc_and_xs (fun xc xs ->
-                     Vmops.destroy ~__context ~xc ~xs ~self:vm domid `Halted;
-                     (* Force an update of the stats - this will cause the 
rrds to be synced back to the master *)
-                     Monitor.do_monitor __context xc
-                  );
-    end;
+       if domid <> -1 then begin
+         with_xc_and_xs 
+                 (fun xc xs ->
+                          let di = Xc.domain_getinfo xc domid in
+                          (* If someone rebooted it while we dropped the lock: 
*)
+                          if Xal.is_running di
+                          then raise 
(Api_errors.Server_error(Api_errors.other_operation_in_progress, [ "VM"; 
Ref.string_of vm ]));
+                          
+                          (* Invoke pre_destroy hook *)
+                          Xapi_hooks.vm_pre_destroy ~__context ~reason:(if 
clean then Xapi_hooks.reason__clean_shutdown else 
Xapi_hooks.reason__hard_shutdown) ~vm;
+                          debug "%s: phase 2/2: destroying old domain (domid 
%d)" api_call_name domid;
+                      Vmops.destroy ~__context ~xc ~xs ~self:vm domid `Halted;
+                      (* Force an update of the stats - this will cause the 
rrds to be synced back to the master *)
+                      Monitor.do_monitor __context xc
+                 )
+       end;
 
     if Db.VM.get_power_state ~__context ~self:vm = `Suspended then begin
       debug "hard_shutdown: destroying any suspend VDI";
@@ -445,6 +544,22 @@
        Xapi_vm_lifecycle.force_state_reset ~__context ~self:vm ~value:`Halted
     end
 
+  (** In the synchronous API call paths, acquire the lock, check if the VM's 
domain has shutdown (if not error out)
+         and continue with the shutdown *)
+  let in_dom0_already_queued args = 
+       Locking_helpers.with_lock args.TwoPhase.vm 
+               (fun _ _ -> 
+                        if TwoPhase.is_vm_running args
+                        then raise 
(Api_errors.Server_error(Api_errors.other_operation_in_progress, [ "VM"; 
Ref.string_of args.TwoPhase.vm ]))
+                        else in_dom0_already_locked args) ()
+
+  (** In the synchronouse API call paths, wait in the 
domU_internal_shutdown_queue and then attempt 
+         to reboot the VM. NB this is the same queue used by the event thread. 
*)
+  let in_dom0 args =
+    Local_work_queue.wait_in_line Local_work_queue.domU_internal_shutdown_queue
+      (Context.string_of_task args.TwoPhase.__context)
+      (fun () -> in_dom0_already_queued args)
+
   let actions = { TwoPhase.in_guest = in_guest; in_dom0 = in_dom0 }
 end
 
@@ -453,13 +568,24 @@
   | `restart -> Reboot.actions
   | `destroy -> Shutdown.actions
 
-(** Add queueing and locking policy for the external API calls *)
-let impose_external_api_policy (x: TwoPhase.t) : TwoPhase.t = 
-  let f args = 
-    Local_work_queue.wait_in_line Local_work_queue.normal_vm_queue
-      (Context.string_of_task args.TwoPhase.__context)
-      (fun () -> x.TwoPhase.in_dom0 args) in
-  { x with TwoPhase.in_dom0 = f }
+(** If our operation conflicts with another parallel operation (i.e. we ask 
for shutdown
+       but guest admin asks for reboot) then we raise an 
OTHER_OPERATION_IN_PROGRESS exception 
+       and retry the whole procedure. *)
+let retry_on_conflict (x: TwoPhase.args) (y: TwoPhase.t) =
+  let rec retry n = 
+       try 
+         y.TwoPhase.in_guest x;
+         if Xapi_fist.disable_sync_lifecycle_path ()
+         then warn "FIST: disable_sync_lifecycle_path: deferring to the event 
thread"
+         else y.TwoPhase.in_dom0 x
+       with 
+       | Api_errors.Server_error(code, _) as e when code = 
Api_errors.other_operation_in_progress ->
+                 let aborting = n < 1 in
+                 debug "Conflict when executing %s: %s" 
x.TwoPhase.api_call_name (if aborting then "aborting" else "retrying");
+                 if aborting then raise e;
+                 retry (n - 1) in
+  retry 10
+  
 
 (** CA-11132: Record information about the shutdown in odd other-config keys 
for Egenera *)
 let record_shutdown_details ~__context ~vm reason initiator action = 
@@ -480,51 +606,36 @@
   
 (** VM.hard_reboot entrypoint *)
 let hard_reboot ~__context ~vm =
-  Locking_helpers.with_lock vm
-    (fun token () ->
-       let action = Db.VM.get_actions_after_reboot ~__context ~self:vm in
-       record_shutdown_details ~__context ~vm Xal.Rebooted "external" action;
-       let args = { TwoPhase.__context=__context; vm=vm; token=Some token; 
api_call_name="VM.hard_reboot"; clean=false } in
-       TwoPhase.execute args (impose_external_api_policy (of_action action))
-       ) ()
+  let action = Db.VM.get_actions_after_reboot ~__context ~self:vm in
+  record_shutdown_details ~__context ~vm Xal.Rebooted "external" action;
+  let args = { TwoPhase.__context=__context; vm=vm; 
api_call_name="VM.hard_reboot"; clean=false } in
+  retry_on_conflict args (of_action action)
 
 (** VM.hard_shutdown entrypoint *)
 let hard_shutdown ~__context ~vm =
-  Locking_helpers.with_lock vm
-    (fun token () ->
-       let action = Db.VM.get_actions_after_shutdown ~__context ~self:vm in
-       record_shutdown_details ~__context ~vm Xal.Halted "external" action;
-       let args = { TwoPhase.__context=__context; vm=vm; token=Some token; 
api_call_name="VM.hard_shutdown"; clean=false } in
-       TwoPhase.execute args (impose_external_api_policy (of_action action))
-    ) ()
+  let action = Db.VM.get_actions_after_shutdown ~__context ~self:vm in
+  record_shutdown_details ~__context ~vm Xal.Halted "external" action;
+  let args = { TwoPhase.__context=__context; vm=vm; 
api_call_name="VM.hard_shutdown"; clean=false } in
+  retry_on_conflict args (of_action action)
 
 (** VM.clean_reboot entrypoint *)
 let clean_reboot ~__context ~vm =
-  Locking_helpers.with_lock vm
-    (fun token () ->
-       let action = Db.VM.get_actions_after_reboot ~__context ~self:vm in
-       record_shutdown_details ~__context ~vm Xal.Rebooted "external" action;
-       let args = { TwoPhase.__context=__context; vm=vm; token=Some token; 
api_call_name="VM.clean_reboot"; clean=true } in
-       TwoPhase.execute args (impose_external_api_policy (of_action action))
-    ) ()
+  let action = Db.VM.get_actions_after_reboot ~__context ~self:vm in
+  record_shutdown_details ~__context ~vm Xal.Rebooted "external" action;
+  let args = { TwoPhase.__context=__context; vm=vm; 
api_call_name="VM.clean_reboot"; clean=true } in
+  retry_on_conflict args (of_action action)
 
 (** VM.clean_shutdown entrypoint *)
 let clean_shutdown ~__context ~vm =
-  Locking_helpers.with_lock vm
-    (fun token () ->
-       let action = Db.VM.get_actions_after_shutdown ~__context ~self:vm in
-       record_shutdown_details ~__context ~vm Xal.Halted "external" action;
-       let args = { TwoPhase.__context=__context; vm=vm; token=Some token; 
api_call_name="VM.clean_shutdown"; clean=true } in
-       TwoPhase.execute args (impose_external_api_policy (of_action action))
-    ) ()
+  let action = Db.VM.get_actions_after_shutdown ~__context ~self:vm in
+  record_shutdown_details ~__context ~vm Xal.Halted "external" action;
+  let args = { TwoPhase.__context=__context; vm=vm; 
api_call_name="VM.clean_shutdown"; clean=true } in
+  retry_on_conflict args (of_action action)
 
 
(***************************************************************************************)
 
-(** VM.hard_reboot_internal: called via the event thread *)
-let hard_reboot_internal ~__context ~vm = 
-  (* VM is locked by the caller *)
-  let args = { TwoPhase.__context=__context; vm=vm; token=None; 
api_call_name="VM.hard_reboot_internal"; clean=false } in
-  Reboot.in_dom0 args
+(** @deprecated *)
+let hard_reboot_internal ~__context ~vm = assert false
 
 
(***************************************************************************************)
 
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/xapi_vm.mli
--- a/ocaml/xapi/xapi_vm.mli    Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/xapi_vm.mli    Thu Dec 10 23:04:50 2009 +0000
@@ -99,27 +99,26 @@
     type args = {
       __context : Context.t;
       vm : API.ref_VM;
-      token : Locking_helpers.token option;
       api_call_name : string;
       clean : bool;
     }
     type t = { in_guest : args -> unit; in_dom0 : args -> unit; }
-    val execute : args -> t -> unit
   end
 module Reboot :
   sig
     val in_guest : TwoPhase.args -> unit
+       val in_dom0_already_locked : TwoPhase.args -> unit
     val in_dom0 : TwoPhase.args -> unit
     val actions : TwoPhase.t
   end
 module Shutdown :
   sig
     val in_guest : TwoPhase.args -> unit
+       val in_dom0_already_locked : TwoPhase.args -> unit
     val in_dom0 : TwoPhase.args -> unit
     val actions : TwoPhase.t
   end
 val of_action : [< `destroy | `restart ] -> TwoPhase.t
-val impose_external_api_policy : TwoPhase.t -> TwoPhase.t
 val record_shutdown_details :
   __context:Context.t ->
   vm:[ `VM ] Ref.t ->
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xapi/xapi_vm_migrate.ml
--- a/ocaml/xapi/xapi_vm_migrate.ml     Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xapi/xapi_vm_migrate.ml     Thu Dec 10 23:04:50 2009 +0000
@@ -126,9 +126,18 @@
   (* If we got the ack, then proceed to shutdown the domain with the suspend
      reason.  If we failed to get the ack, then raise an exception to abort
      the migration *)
-  if (ack = `ACKED) then 
-    Vmops.clean_shutdown_with_reason ~xal ~__context ~self domid Domain.Suspend
-  else 
+  if (ack = `ACKED) then begin
+    match Vmops.clean_shutdown_with_reason ~xal ~__context ~self domid 
Domain.Suspend with
+       | Xal.Suspended -> () (* good *)
+       | Xal.Crashed ->
+                 raise (Api_errors.Server_error(Api_errors.vm_crashed, [ 
Ref.string_of self ]))
+       | Xal.Rebooted ->
+                 raise (Api_errors.Server_error(Api_errors.vm_rebooted, [ 
Ref.string_of self ]))       
+       | Xal.Vanished
+       | Xal.Halted ->
+                 raise (Api_errors.Server_error(Api_errors.vm_halted, [ 
Ref.string_of self ]))
+       | Xal.Shutdown x -> vm_migrate_failed (Printf.sprintf "Domain shutdown 
for unexpected reason: %d" x)
+  end else 
     vm_migrate_failed "Failed to receive suspend acknowledgement within 
timeout period or an abort was requested."
 
 (* ------------------------------------------------------------------- *)
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xenops/domain.mli
--- a/ocaml/xenops/domain.mli   Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xenops/domain.mli   Thu Dec 10 23:04:50 2009 +0000
@@ -85,7 +85,7 @@
 val shutdown: xs:Xs.xsh -> domid -> shutdown_reason -> unit
 
 (** Tell the domain to shutdown with reason ''shutdown_reason', waiting for an 
ack *)
-val shutdown_ack: ?timeout:float -> xc:Xc.handle -> xs:Xs.xsh -> domid -> 
shutdown_reason -> bool
+val shutdown_wait_for_ack: ?timeout:float -> xc:Xc.handle -> xs:Xs.xsh -> 
domid -> shutdown_reason -> unit
 
 (** send a domain a sysrq *)
 val sysrq: xs:Xs.xsh -> domid -> char -> unit
diff -r ca92f46da128 -r 9edc8c86f01d ocaml/xenops/xenops.ml
--- a/ocaml/xenops/xenops.ml    Thu Dec 10 23:04:49 2009 +0000
+++ b/ocaml/xenops/xenops.ml    Thu Dec 10 23:04:50 2009 +0000
@@ -76,7 +76,12 @@
        printf "built hvm domain: %u\n" domid
 
 let clean_shutdown_domain ~xal ~domid ~reason ~sync =
-       let acked = Domain.shutdown_ack (Xal.xc_of_ctx xal) (Xal.xs_of_ctx xal) 
domid reason in
+  let xc = Xal.xc_of_ctx xal in
+  let xs = Xal.xs_of_ctx xal in
+  Domain.shutdown ~xs domid reason;
+  (* Wait for any necessary acknowledgement. If we get a Watch.Timeout _ then
+        we abort early; otherwise we continue in Xal.wait_release below. *)
+  let acked = try Domain.shutdown_wait_for_ack ~xc ~xs domid reason; true with 
Watch.Timeout _ -> false in
        if not acked then (
                eprintf "domain %u didn't acknowledged shutdown\n" domid;
        ) else (
13 files changed, 497 insertions(+), 139 deletions(-)
ocaml/idl/api_errors.ml           |    3 
ocaml/idl/datamodel.ml            |    6 
ocaml/xapi/OMakefile              |    2 
ocaml/xapi/events.ml              |   20 +-
ocaml/xapi/quicktest.ml           |    1 
ocaml/xapi/quicktest_lifecycle.ml |  194 ++++++++++++++++++++++++++
ocaml/xapi/vmops.ml               |   91 ++++++------
ocaml/xapi/xapi_fist.ml           |   17 ++
ocaml/xapi/xapi_vm.ml             |  273 ++++++++++++++++++++++++++-----------
ocaml/xapi/xapi_vm.mli            |    5 
ocaml/xapi/xapi_vm_migrate.ml     |   15 +-
ocaml/xenops/domain.mli           |    2 
ocaml/xenops/xenops.ml            |    7 


Attachment: xen-api.hg-4.patch
Description: Text Data

_______________________________________________
xen-api mailing list
xen-api@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/mailman/listinfo/xen-api