# HG changeset patch
# User Mike McClurg
# Date 1292518952 0
# Node ID 6d0cdb76d845fc9d0f2d3a30521a7afbde2936bc
# Parent  0e8d1cf6b4048cd0fc4c0154bfc27d318dfc7b98
imported patch CA-48539-XAPI_call_vdi_deactivate_during_pool-ha-disable

diff --git a/ocaml/xapi/static_vdis.ml b/ocaml/xapi/static_vdis.ml
--- a/ocaml/xapi/static_vdis.ml
+++ b/ocaml/xapi/static_vdis.ml
@@ -25,56 +25,68 @@
 let static_vdis = "/opt/xensource/bin/static-vdis"

 (** Generate the static configuration and attach the VDI now *)
-let permanent_vdi_attach ~__context ~vdi ~reason =
-  info "permanent_vdi_attach: vdi = %s; sr = %s"
-    (Ref.string_of vdi) (Ref.string_of (Db.VDI.get_SR ~__context ~self:vdi));
-  Helpers.call_script static_vdis
-    [ "add"; Db.VDI.get_uuid ~__context ~self:vdi; reason ];
-  (* VDI will be attached on next boot; attach it now too *)
-  String.rtrim (Helpers.call_script static_vdis
-    [ "attach"; Db.VDI.get_uuid ~__context ~self:vdi ])
+let permanent_vdi_attach ~__context ~vdi ~reason =
+  info "permanent_vdi_attach: vdi = %s; sr = %s"
+    (Ref.string_of vdi) (Ref.string_of (Db.VDI.get_SR ~__context ~self:vdi));
+  ignore (Helpers.call_script static_vdis
+    [ "add"; Db.VDI.get_uuid ~__context ~self:vdi; reason ]);
+  (* VDI will be attached on next boot; attach it now too *)
+  String.rtrim (Helpers.call_script static_vdis
+    [ "attach"; Db.VDI.get_uuid ~__context ~self:vdi ])

- (** Detach the VDI (by reference) now and destroy the static configuration *)
-let permanent_vdi_detach ~__context ~vdi =
-  info "permanent_vdi_detach: vdi = %s; sr = %s"
-    (Ref.string_of vdi) (Ref.string_of (Db.VDI.get_SR ~__context ~self:vdi));
-  Sm.call_sm_vdi_functions ~__context ~vdi
-    (fun srconf srtype sr -> Sm.vdi_detach srconf srtype sr vdi);
-  ignore(Helpers.call_script static_vdis
-    [ "del"; Db.VDI.get_uuid ~__context ~self:vdi ])
+(** Detach the VDI (by reference) now and destroy the static configuration *)
+let permanent_vdi_detach ~__context ~vdi =
+  info "permanent_vdi_detach: vdi = %s; sr = %s"
+    (Ref.string_of vdi) (Ref.string_of (Db.VDI.get_SR ~__context ~self:vdi));
+  Sm.call_sm_vdi_functions ~__context ~vdi
+    (fun srconf srtype sr -> Sm.vdi_detach srconf srtype sr vdi);
+  ignore(Helpers.call_script static_vdis
+    [ "del"; Db.VDI.get_uuid ~__context ~self:vdi ])

 (** Detach the VDI (by uuid) now and destroy the static configuration *)
-let permanent_vdi_detach_by_uuid ~__context ~uuid =
-  info "permanent_vdi_detach: vdi-uuid = %s" uuid;
-  begin
-    try
-      (* This might fail because the VDI has been destroyed *)
-      let vdi = Db.VDI.get_by_uuid ~__context ~uuid in
-      Sm.call_sm_vdi_functions ~__context ~vdi
-        (fun srconf srtype sr -> Sm.vdi_detach srconf srtype sr vdi)
-    with e ->
-      warn "Ignoring exception calling SM vdi_detach for VDI uuid %s: %s (possibly VDI has been deleted while we were offline" uuid (ExnHelper.string_of_exn e)
-  end;
-  ignore(Helpers.call_script static_vdis [ "del"; uuid ])
+let permanent_vdi_detach_by_uuid ~__context ~uuid =
+  info "permanent_vdi_detach: vdi-uuid = %s" uuid;
+  begin
+    try
+      (* This might fail because the VDI has been destroyed *)
+      let vdi = Db.VDI.get_by_uuid ~__context ~uuid in
+      Sm.call_sm_vdi_functions ~__context ~vdi
+        (fun srconf srtype sr -> Sm.vdi_detach srconf srtype sr vdi)
    with e ->
+      warn "Ignoring exception calling SM vdi_detach for VDI uuid %s: %s (possibly the VDI has been deleted while we were offline)" uuid (ExnHelper.string_of_exn e)
+  end;
+  ignore(Helpers.call_script static_vdis [ "del"; uuid ])
+
+(** Added for CA-48539. Deactivates a VDI. You should probably follow
+    this call with one of the vdi_detach functions above. *)
+let permanent_vdi_deactivate_by_uuid ~__context ~uuid =
+  info "permanent_vdi_deactivate: vdi-uuid = %s" uuid;
+  try
+    let vdi = Db.VDI.get_by_uuid ~__context ~uuid in
+    Sm.call_sm_vdi_functions ~__context ~vdi
+      (fun srconf srtype sr -> Sm.vdi_deactivate srconf srtype sr vdi)
+  with e ->
+    warn "Ignoring exception calling SM vdi_deactivate for VDI uuid %s: %s (possibly the VDI has been deleted while we were offline)"
+      uuid
+      (ExnHelper.string_of_exn e)

 (** Detaches and removes records for VDIs which have been deleted *)
-let gc () =
-  Server_helpers.exec_with_new_task "GCing on-boot VDIs" (fun __context ->
-    List.iter
-      (fun vdi ->
-        let exists = try ignore(Db.VDI.get_by_uuid ~__context ~uuid:vdi.uuid); true with _ -> false in
-        if not(exists) then begin
-          warn "static-vdi %s cannot be found in database; removing on-boot configuration" vdi.uuid;
-          (* NB we can't call the SM functions since the record has gone *)
-          ignore(Helpers.call_script static_vdis [ "del"; vdi.uuid ])
-        end
-      ) (list ()))
+let gc () =
+  Server_helpers.exec_with_new_task "GCing on-boot VDIs" (fun __context ->
+    List.iter
+      (fun vdi ->
+        let exists = try ignore(Db.VDI.get_by_uuid ~__context ~uuid:vdi.uuid); true with _ -> false in
+        if not(exists) then begin
+          warn "static-vdi %s cannot be found in database; removing on-boot configuration" vdi.uuid;
+          (* NB we can't call the SM functions since the record has gone *)
+          ignore(Helpers.call_script static_vdis [ "del"; vdi.uuid ])
+        end
+      ) (list ()))
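(* Editorial sketch, not part of the patch: how a caller is expected to
   combine the new deactivate call with the existing detach, in the order
   CA-48539 requires (deactivate first, then detach). The helper name is
   hypothetical; both callees are defined in static_vdis.ml above. *)
let teardown_statefile_vdi ~__context ~uuid =
  Static_vdis.permanent_vdi_deactivate_by_uuid ~__context ~uuid;
  Static_vdis.permanent_vdi_detach_by_uuid ~__context ~uuid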
 (** If we just rebooted and failed to attach our static VDIs then this can be called to reattempt the attach:
-    this is necessary for HA to start. *)
-let reattempt_on_boot_attach () =
-  let script = "/etc/init.d/attach-static-vdis" in
-  try
-    ignore(Helpers.call_script script [ "start" ])
-  with e ->
-    warn "Attempt to reattach static VDIs via '%s start' failed: %s" script (ExnHelper.string_of_exn e)
+    this is necessary for HA to start. *)
+let reattempt_on_boot_attach () =
+  let script = "/etc/init.d/attach-static-vdis" in
+  try
+    ignore(Helpers.call_script script [ "start" ])
+  with e ->
+    warn "Attempt to reattach static VDIs via '%s start' failed: %s" script (ExnHelper.string_of_exn e)

diff --git a/ocaml/xapi/static_vdis_list.ml b/ocaml/xapi/static_vdis_list.ml
--- a/ocaml/xapi/static_vdis_list.ml
+++ b/ocaml/xapi/static_vdis_list.ml
@@ -33,7 +33,7 @@
     let path = Filename.concat main_dir x in
     let uuid = Unixext.string_of_file (Filename.concat path "vdi-uuid") in
     let reason = Unixext.string_of_file (Filename.concat path "reason") in
-    let bool_of_string x = String.lowercase x = "true" in
+    (* let bool_of_string x = String.lowercase x = "true" in *)
     let delete_next_boot =
       try ignore(Unix.stat (Filename.concat path "delete-next-boot")); true
       with _ -> false in

diff --git a/ocaml/xapi/xapi_ha.ml b/ocaml/xapi/xapi_ha.ml
--- a/ocaml/xapi/xapi_ha.ml
+++ b/ocaml/xapi/xapi_ha.ml
@@ -50,7 +50,7 @@
 let call_script ?log_successful_output script args =
   try
     Mutex.execute ha_script_m (fun () -> Helpers.call_script ?log_successful_output script args)
-  with Forkhelpers.Spawn_internal_error(stderr, stdout, Unix.WEXITED n) as e ->
+  with Forkhelpers.Spawn_internal_error(stderr, stdout, Unix.WEXITED n) ->
     let code = Xha_errno.of_int n in
     warn "%s %s returned %s (%s)" script (String.concat " " args)
       (Xha_errno.to_string code) (Xha_errno.to_description_string code);
@@ -97,7 +97,7 @@
   String_unmarshall_helper.map (fun x -> x) (fun x -> x) v

 (** Without using the Pool's database, returns the IP address of a particular host
-    named by UUID. *)
+    named by UUID. *)
 let address_of_host_uuid uuid =
   let table = get_uuid_to_ip_mapping () in
   if not(List.mem_assoc uuid table) then begin
@@ -106,8 +106,8 @@
   end else List.assoc uuid table

 (** Without using the Pool's database, returns the UUID of a particular host named by
-    heartbeat IP address. This is only necesary because the liveset info doesn't include
-    the host IP address *)
+    heartbeat IP address. This is only necessary because the liveset info doesn't include
+    the host IP address. *)
 let uuid_of_host_address address =
   let table = List.map (fun (k, v) -> v, k) (get_uuid_to_ip_mapping ()) in
   if not(List.mem_assoc address table) then begin
@@ -116,13 +116,13 @@
   end else List.assoc address table

 (** Called in two circumstances:
-    1. When I started up I thought I was the master but my proposal was rejected by the
-       heartbeat component.
-    2. I was happily running as someone's slave but they left the liveset.
+    1. When I started up I thought I was the master but my proposal was rejected by the
+       heartbeat component.
+    2. I was happily running as someone's slave but they left the liveset.
 *)
 let on_master_failure () =
   (* The plan is: keep asking if I should be the master. If I'm rejected then query the
-     live set and see if someone else has been marked as master, if so become a slave of them. *)
+     live set and see if someone else has been marked as master, if so become a slave of them. *)

   let become_master () =
     info "This node will become the master";
@@ -391,7 +391,7 @@
 let process_liveset_on_master liveset =
   let pool = Helpers.get_pool ~__context in
   let to_tolerate = Int64.to_int (Db.Pool.get_ha_host_failures_to_tolerate ~__context ~self:pool) in
-  let planned_for = Int64.to_int (Db.Pool.get_ha_plan_exists_for ~__context ~self:pool) in
+  (* let planned_for = Int64.to_int (Db.Pool.get_ha_plan_exists_for ~__context ~self:pool) in *)

   (* First consider whether VM failover actions need to happen.
      Convert the liveset into a list of Host references used by the VM failover code *)

@@ -725,13 +725,13 @@

 (** Called when xapi restarts: server may be in emergency mode at this point. We need
-    to inspect the local configuration and if HA is supposed to be armed we need to
-    set everything up.
-    Note that the master shouldn't be able to activate HA while we are offline since that would cause
-    us to come up with a broken configuration (the enable-HA stage has the critical task of
-    synchronising the HA configuration on all the hosts). So really we only want to notice
-    if the Pool has had HA disabled while we were offline. *)
+    to inspect the local configuration and if HA is supposed to be armed we need to
+    set everything up.
+    Note that the master shouldn't be able to activate HA while we are offline since that would cause
+    us to come up with a broken configuration (the enable-HA stage has the critical task of
+    synchronising the HA configuration on all the hosts). So really we only want to notice
+    if the Pool has had HA disabled while we were offline. *)
 let on_server_restart () =
   let armed = bool_of_string (Localdb.get Constants.ha_armed) in
@@ -760,7 +760,7 @@
       let (_ : string) = call_script ha_start_daemon [] in
       finished := true;
     with
-    | Xha_error Xha_errno.Mtc_exit_daemon_is_present as e ->
+    | Xha_error Xha_errno.Mtc_exit_daemon_is_present ->
       warn "ha_start_daemon failed with MTC_EXIT_DAEMON_IS_PRESENT: continuing with startup";
       finished := true;
     | Xha_error Xha_errno.Mtc_exit_invalid_pool_state as e ->
@@ -826,8 +826,8 @@
   end

 (** Called in the master xapi startup when the database is ready. We set all hosts (including this one) to
-    disabled then signal the monitor thread to look. It can then wait for slaves to turn up
-    before trying to restart VMs. *)
+    disabled then signal the monitor thread to look. It can then wait for slaves to turn up
+    before trying to restart VMs. *)
 let on_database_engine_ready () =
   info "Setting all hosts to dead and disabled. Hosts must re-enable themselves explicitly";
   Server_helpers.exec_with_new_task "Setting all hosts to dead and disabled"
@@ -846,7 +846,7 @@
 (* Internal API calls to configure individual hosts *)

 (** Internal API call to prevent this node making an unsafe failover decision.
-    This call is idempotent. *)
+    This call is idempotent. *)
 let ha_disable_failover_decisions __context localhost =
   debug "Disabling failover decisions";
   (* FIST *)
@@ -857,8 +857,8 @@
   Localdb.put Constants.ha_disable_failover_decisions "true"

 (** Internal API call to disarm localhost.
-    If the daemon is missing then we return success. Either fencing was previously disabled and the
-    daemon has shutdown OR the daemon has died and this node will fence shortly...
+    If the daemon is missing then we return success. Either fencing was previously disabled and the
+    daemon has shutdown OR the daemon has died and this node will fence shortly...
 *)
 let ha_disarm_fencing __context localhost =
   try
@@ -870,7 +870,7 @@
   let (_ : string) = call_script ha_set_excluded [] in ()

 (** Internal API call to stop the HA daemon.
-    This call is idempotent. *)
+    This call is idempotent. *)
 let ha_stop_daemon __context localhost =
   Monitor.stop ();
   let (_ : string) = call_script ha_stop_daemon [] in ()
@@ -898,29 +898,38 @@
     (* Might not be able to access the database to detach statefiles; however this isn't critical *)
     ()

-(** Internal API call to release any HA resources after the system has been shutdown.
-    This call is idempotent. *)
+(** Internal API call to release any HA resources after the system has
+    been shutdown. This call is idempotent. Modified for CA-48539 to
+    call vdi.deactivate before vdi.detach. *)
 let ha_release_resources __context localhost =
   Monitor.stop ();

-  (* Detach any statefile VDIs *)
-  let pool = Helpers.get_pool ~__context in
-  List.iter
-    (fun vdi ->
-      let uuid = Db.VDI.get_uuid ~__context ~self:vdi in
-      Helpers.log_exn_continue
-        (Printf.sprintf "detaching statefile VDI uuid: %s" uuid)
-        (fun () -> Static_vdis.permanent_vdi_detach ~__context ~vdi) ()
-    ) (List.map Ref.of_string (Db.Pool.get_ha_statefiles ~__context ~self:pool));
-  (* Detach any metadata VDIs *)
-  Xha_metadata_vdi.detach_existing ~__context;
+  (* Why aren't we calling Xha_statefile.detach_existing_statefiles?
+     Does Db.Pool.get_ha_statefiles return a different set of
+     statefiles than Xha_statefile.list_existing_statefiles? *)
+
+  (* Deactivate and detach all statefile VDIs in the entire pool *)
+  let statefile_vdis = Db.Pool.get_ha_statefiles ~__context ~self:(Helpers.get_pool ~__context)
+  and deactivate_and_detach_vdi vdi_str =
+    let uuid = Db.VDI.get_uuid ~__context ~self:(Ref.of_string vdi_str) in
+    Helpers.log_exn_continue
+      (Printf.sprintf "deactivating and detaching statefile VDI uuid: %s" uuid)
+      (fun () ->
+        Static_vdis.permanent_vdi_deactivate_by_uuid ~__context ~uuid;
+        Static_vdis.permanent_vdi_detach_by_uuid ~__context ~uuid) ()
+  in List.iter deactivate_and_detach_vdi statefile_vdis;
+
+  (* Deactivate and detach any metadata VDIs *)
+  Helpers.log_exn_continue
+    "deactivating and detaching metadata VDIs"
+    (fun () -> Xha_metadata_vdi.deactivate_and_detach_existing ~__context) ();

   (* At this point a restart won't enable the HA subsystem *)
   Localdb.put Constants.ha_armed "false"
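(* Editorial sketch, not part of the patch: ha_release_resources above makes
   every teardown step best-effort via Helpers.log_exn_continue. A minimal
   model of such a wrapper, assuming this is the shape of the real helper: *)
let log_exn_continue description f x =
  try f x
  with e ->
    (* log and swallow the exception so the remaining steps still run *)
    warn "Ignoring exception: %s while %s" (ExnHelper.string_of_exn e) description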
 (** Internal API call which blocks until this node's xHA daemon spots the invalid statefile
-    and exits cleanly. If the daemon survives but the statefile access is lost then this function
-    will return an exception and the no-statefile shutdown can be attempted.
+    and exits cleanly. If the daemon survives but the statefile access is lost then this function
+    will return an exception and the no-statefile shutdown can be attempted.
 *)
 let ha_wait_for_shutdown_via_statefile __context localhost =
   try
@@ -978,7 +987,7 @@
 let write_config_file ~__context statevdi_paths generation =
   let local_heart_beat_interface = Xapi_inventory.lookup Xapi_inventory._management_interface in
   (* Need to find the name of the physical interface, so xHA can monitor the bonding status (if appropriate).
-     Note that this interface isn't used for sending packets so VLANs don't matter: the physical NIC or bond device is all we need. *)
+     Note that this interface isn't used for sending packets so VLANs don't matter: the physical NIC or bond device is all we need. *)
   let localhost = Helpers.get_localhost ~__context in
   let mgmt_pifs = List.filter (fun self -> Db.PIF.get_management ~__context ~self) (Db.Host.get_PIFs ~__context ~self:localhost) in
   if mgmt_pifs = [] then failwith (Printf.sprintf "Cannot enable HA on host %s: there is no management interface for heartbeating" (Db.Host.get_hostname ~__context ~self:localhost));
@@ -1031,7 +1040,7 @@
   Db.Host.set_ha_statefiles ~__context ~self:localhost ~value:(List.map Ref.string_of statevdis);

   (* The master has already attached the statefile VDIs and written the
-     configuration file. *)
+     configuration file. *)
   if not(Pool_role.is_master ()) then begin
     let statefiles = attach_statefiles ~__context statevdis in
     write_config_file ~__context statefiles generation;
@@ -1053,9 +1062,9 @@
   info "Local flag ha_armed <- true";

   (* If this host is the current master then it must assert its authority as master;
-     otherwise another host's heartbeat thread might conclude that the master has gone
-     and propose itself. This would lead the xHA notion of master to immediately diverge
-     from the XenAPI notion. *)
+     otherwise another host's heartbeat thread might conclude that the master has gone
+     and propose itself. This would lead the xHA notion of master to immediately diverge
+     from the XenAPI notion. *)
   if Pool_role.is_master () then begin
     if not (propose_master ()) then
       failwith "failed to propose the current master as master";
@@ -1189,13 +1198,13 @@
   redo_log_ha_disabled_during_runtime __context;

   (* Steps from 8.6 Disabling HA
-     If the master has access to the state file (how do we determine this?)
-       * ha_set_pool_state(invalid)
-     If the master hasn't access to the state file but all hosts are available via heartbeat
-       * set the flag "can not be master and no VM failover decision on next boot"
-       * ha_disarm_fencing()
-       * ha_stop_daemon()
-     Otherwise we'll be fenced *)
+     If the master has access to the state file (how do we determine this?)
+       * ha_set_pool_state(invalid)
+     If the master hasn't access to the state file but all hosts are available via heartbeat
+       * set the flag "can not be master and no VM failover decision on next boot"
+       * ha_disarm_fencing()
+       * ha_stop_daemon()
+     Otherwise we'll be fenced *)

   let hosts = Db.Host.get_all ~__context in
@@ -1380,7 +1389,7 @@
         (List.map (fun (pif,pifr) -> Ref.string_of pif) unplugged_ununpluggable_pifs)));

   (* Check also that any PIFs with IP information set are currently attached - it's a non-fatal
-     error if they are, but we'll warn with a message *)
+     error if they are not, but we'll warn with a message *)
   let pifs_with_ip_config = List.filter (fun (_,pifr) -> pifr.API.pIF_ip_configuration_mode <> `None) pifs in
   let not_bond_slaves = List.filter (fun (_,pifr) -> not (Db.is_valid_ref pifr.API.pIF_bond_slave_of)) pifs_with_ip_config in
   let without_disallow_unplug = List.filter (fun (_,pifr) -> not (pifr.API.pIF_disallow_unplug || pifr.API.pIF_management)) not_bond_slaves in
@@ -1406,7 +1415,7 @@
       if not alive then raise (Api_errors.Server_error(Api_errors.host_offline, [ Ref.string_of host ]))
     ) (Db.Host.get_all ~__context);

-  let set_difference a b = List.filter (fun x -> not(List.mem x b)) a in
+  (* let set_difference a b = List.filter (fun x -> not(List.mem x b)) a in *)
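(* Editorial note, not part of the patch: the local helper commented out
   above is not lost -- an identical set_difference remains defined at the
   top of ocaml/xapi/xha_statefile.ml (see below): *)
let set_difference a b = List.filter (fun x -> not (List.mem x b)) a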
  (* Steps from 8.7 Enabling HA in Marathon spec:
     * 1. Bring up state file VDI(s)

diff --git a/ocaml/xapi/xha_metadata_vdi.ml b/ocaml/xapi/xha_metadata_vdi.ml
--- a/ocaml/xapi/xha_metadata_vdi.ml
+++ b/ocaml/xapi/xha_metadata_vdi.ml
@@ -57,6 +57,12 @@
   let vdis = list_existing() in
   List.iter (fun x -> Static_vdis.permanent_vdi_detach_by_uuid ~__context ~uuid:x.Static_vdis.uuid) vdis

+(** Added for CA-48539 *)
+let deactivate_and_detach_existing ~__context =
+  let vdi_uuids = List.map (fun vdi -> vdi.Static_vdis.uuid) (list_existing ()) in
+  List.iter (fun vdi_uuid -> Static_vdis.permanent_vdi_deactivate_by_uuid ~__context ~uuid:vdi_uuid) vdi_uuids;
+  List.iter (fun vdi_uuid -> Static_vdis.permanent_vdi_detach_by_uuid ~__context ~uuid:vdi_uuid) vdi_uuids
+
 open Pervasiveext

 (** Attempt to flush the database to the metadata VDI *)
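(* Editorial sketch, not part of the patch: deactivate_and_detach_existing
   above deliberately deactivates every metadata VDI before detaching any of
   them, rather than interleaving deactivate/detach per VDI. The generic
   two-phase shape is: *)
let two_phase first second xs =
  List.iter first xs;   (* run phase 1 over the whole list... *)
  List.iter second xs   (* ...before phase 2 touches any element *)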
diff --git a/ocaml/xapi/xha_statefile.ml b/ocaml/xapi/xha_statefile.ml
--- a/ocaml/xapi/xha_statefile.ml
+++ b/ocaml/xapi/xha_statefile.ml
@@ -28,84 +28,90 @@
 open Listext
 open Stringext

-(** Return the minimum size of an HA statefile, as of
-    XenServer HA state-file description vsn 1.3 *)
-let minimum_size number_of_hosts =
-  let ( ** ) = Int64.mul
-  and ( ++ ) = Int64.add in
-
-  let global_section_size = 4096L
-  and host_section_size = 4096L in
-  global_section_size ++ (Int64.of_int number_of_hosts) ** host_section_size
+(** Return the minimum size of an HA statefile, as of
+    XenServer HA state-file description vsn 1.3 *)
+let minimum_size number_of_hosts =
+  let ( ** ) = Int64.mul
+  and ( ++ ) = Int64.add in
+
+  let global_section_size = 4096L
+  and host_section_size = 4096L in
+  global_section_size ++ (Int64.of_int number_of_hosts) ** host_section_size

 let set_difference a b = List.filter (fun x -> not(List.mem x b)) a

-let assert_sr_can_host_statefile ~__context ~sr =
-  (* Check that each host has a PBD to this SR *)
-  let pbds = Db.SR.get_PBDs ~__context ~self:sr in
-  let connected_hosts = List.setify (List.map (fun self -> Db.PBD.get_host ~__context ~self) pbds) in
-  let all_hosts = Db.Host.get_all ~__context in
-  if List.length connected_hosts < (List.length all_hosts) then begin
-    error "Cannot place statefile in SR %s: some hosts lack a PBD: [ %s ]"
-      (Ref.string_of sr)
-      (String.concat "; " (List.map Ref.string_of (set_difference all_hosts connected_hosts)));
-    raise (Api_errors.Server_error(Api_errors.sr_no_pbds, [ Ref.string_of sr ]))
-  end;
-  (* Check that each PBD is plugged in *)
-  List.iter (fun self ->
-    if not(Db.PBD.get_currently_attached ~__context ~self) then begin
-      error "Cannot place statefile in SR %s: PBD %s is not plugged"
-        (Ref.string_of sr) (Ref.string_of self);
-      (* Same exception is used in this case (see Helpers.assert_pbd_is_plugged) *)
-      raise (Api_errors.Server_error(Api_errors.sr_no_pbds, [ Ref.string_of sr ]))
-    end) pbds;
-  (* Check the exported capabilities of the SR's SM plugin *)
-  let srtype = Db.SR.get_type ~__context ~self:sr in
-  if not (List.mem Smint.Vdi_generate_config (Sm.capabilities_of_driver srtype))
-  then raise (Api_errors.Server_error (Api_errors.sr_operation_not_supported, [Ref.string_of sr]))
+let assert_sr_can_host_statefile ~__context ~sr =
+  (* Check that each host has a PBD to this SR *)
+  let pbds = Db.SR.get_PBDs ~__context ~self:sr in
+  let connected_hosts = List.setify (List.map (fun self -> Db.PBD.get_host ~__context ~self) pbds) in
+  let all_hosts = Db.Host.get_all ~__context in
+  if List.length connected_hosts < (List.length all_hosts) then begin
+    error "Cannot place statefile in SR %s: some hosts lack a PBD: [ %s ]"
+      (Ref.string_of sr)
+      (String.concat "; " (List.map Ref.string_of (set_difference all_hosts connected_hosts)));
+    raise (Api_errors.Server_error(Api_errors.sr_no_pbds, [ Ref.string_of sr ]))
+  end;
+  (* Check that each PBD is plugged in *)
+  List.iter (fun self ->
+    if not(Db.PBD.get_currently_attached ~__context ~self) then begin
+      error "Cannot place statefile in SR %s: PBD %s is not plugged"
+        (Ref.string_of sr) (Ref.string_of self);
+      (* Same exception is used in this case (see Helpers.assert_pbd_is_plugged) *)
+      raise (Api_errors.Server_error(Api_errors.sr_no_pbds, [ Ref.string_of sr ]))
+    end) pbds;
+  (* Check the exported capabilities of the SR's SM plugin *)
+  let srtype = Db.SR.get_type ~__context ~self:sr in
+  if not (List.mem Smint.Vdi_generate_config (Sm.capabilities_of_driver srtype))
+  then raise (Api_errors.Server_error (Api_errors.sr_operation_not_supported, [Ref.string_of sr]))

-let list_srs_which_can_host_statefile ~__context =
-  List.filter (fun sr -> try assert_sr_can_host_statefile ~__context ~sr; true
-    with _ -> false) (Db.SR.get_all ~__context)
+let list_srs_which_can_host_statefile ~__context =
+  List.filter (fun sr -> try assert_sr_can_host_statefile ~__context ~sr; true
+    with _ -> false) (Db.SR.get_all ~__context)

-let create ~__context ~sr =
-  assert_sr_can_host_statefile ~__context ~sr;
-  let size = minimum_size (List.length (Db.Host.get_all ~__context)) in
-  Helpers.call_api_functions ~__context
-    (fun rpc session_id ->
-      Client.VDI.create ~rpc ~session_id
-        ~name_label:"Statefile for HA"
-        ~name_description:"Used for storage heartbeating"
-        ~sR:sr ~virtual_size:size ~_type:`ha_statefile
-        ~sharable:true ~read_only:false ~other_config:[] ~xenstore_data:[] ~sm_config:statefile_sm_config ~tags:[]
-    )
+let create ~__context ~sr =
+  assert_sr_can_host_statefile ~__context ~sr;
+  let size = minimum_size (List.length (Db.Host.get_all ~__context)) in
+  Helpers.call_api_functions ~__context
+    (fun rpc session_id ->
+      Client.VDI.create ~rpc ~session_id
+        ~name_label:"Statefile for HA"
+        ~name_description:"Used for storage heartbeating"
+        ~sR:sr ~virtual_size:size ~_type:`ha_statefile
+        ~sharable:true ~read_only:false ~other_config:[] ~xenstore_data:[] ~sm_config:statefile_sm_config ~tags:[]
+    )
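(* Editorial example, not part of the patch: with the 4096-byte global and
   per-host sections defined in minimum_size above, a 16-host pool needs
   4096 + 16 * 4096 = 69632 bytes of statefile. *)
let () = assert (minimum_size 16 = 69632L)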
 (** Return a reference to a valid statefile VDI in the given SR.
     This function prefers to reuse existing VDIs to avoid confusing the heartbeat component:
     it expects to see a poisoned VDI but not necessarily a stale or corrupted one.
     Consider that when using LVM-based SRs the VDI could be deleted on the master
     but the slaves would still have access to stale data. *)
-let find_or_create ~__context ~sr =
-  assert_sr_can_host_statefile ~__context ~sr;
-  let size = minimum_size (List.length (Db.Host.get_all ~__context)) in
-  match
-    List.filter
-      (fun self -> true
-        && (Db.VDI.get_type ~__context ~self = `ha_statefile)
-        && (Db.VDI.get_virtual_size ~__context ~self >= size))
-      (Db.SR.get_VDIs ~__context ~self:sr) with
-  | x :: _ ->
-    info "re-using existing statefile: %s" (Db.VDI.get_uuid ~__context ~self:x);
-    x
-  | [] ->
-    info "no suitable existing statefile found; creating a fresh one";
-    create ~__context ~sr
+let find_or_create ~__context ~sr =
+  assert_sr_can_host_statefile ~__context ~sr;
+  let size = minimum_size (List.length (Db.Host.get_all ~__context)) in
+  match
+    List.filter
+      (fun self -> true
+        && (Db.VDI.get_type ~__context ~self = `ha_statefile)
+        && (Db.VDI.get_virtual_size ~__context ~self >= size))
+      (Db.SR.get_VDIs ~__context ~self:sr) with
+  | x :: _ ->
+    info "re-using existing statefile: %s" (Db.VDI.get_uuid ~__context ~self:x);
+    x
+  | [] ->
+    info "no suitable existing statefile found; creating a fresh one";
+    create ~__context ~sr

-let list_existing_statefiles () =
-  List.filter (fun x -> x.Static_vdis.reason = reason) (Static_vdis.list ())
+let list_existing_statefiles () =
+  List.filter (fun x -> x.Static_vdis.reason = reason) (Static_vdis.list ())

 (** Detach all statefiles attached with reason 'HA statefile', to clear stale state *)
-let detach_existing_statefiles ~__context =
-  let statefiles = List.filter (fun x -> x.Static_vdis.reason = reason) (Static_vdis.list ()) in
-  List.iter (fun x -> Static_vdis.permanent_vdi_detach_by_uuid ~__context ~uuid:x.Static_vdis.uuid) statefiles
-
+let detach_existing_statefiles ~__context =
+  let statefile_uuids = List.map (fun vdi -> vdi.Static_vdis.uuid) (list_existing_statefiles ()) in
+  List.iter (fun uuid -> Static_vdis.permanent_vdi_detach_by_uuid ~__context ~uuid) statefile_uuids
+
+(** Added for CA-48539. Deactivate and detach all statefiles attached
+    with reason 'HA statefile', to clear stale state *)
+let deactivate_and_detach_existing_statefiles ~__context =
+  let statefile_uuids = List.map (fun vdi -> vdi.Static_vdis.uuid) (list_existing_statefiles ()) in
+  List.iter (fun uuid -> Static_vdis.permanent_vdi_deactivate_by_uuid ~__context ~uuid) statefile_uuids;
+  List.iter (fun uuid -> Static_vdis.permanent_vdi_detach_by_uuid ~__context ~uuid) statefile_uuids
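(* Editorial sketch, not part of the patch: the two bulk CA-48539 teardown
   helpers added above, driven from a fresh task context using the same
   Server_helpers pattern as static_vdis.ml's gc function. *)
let _ =
  Server_helpers.exec_with_new_task "CA-48539 teardown sketch"
    (fun __context ->
      Xha_statefile.deactivate_and_detach_existing_statefiles ~__context;
      Xha_metadata_vdi.deactivate_and_detach_existing ~__context)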