[Skiboot] [PATCH 04/16] core/pci: Train link of PHB slots when hotplugging

Frederic Barrat fbarrat at linux.ibm.com
Wed Oct 2 03:27:05 AEST 2019



Le 01/10/2019 à 14:02, Oliver O'Halloran a écrit :
> On Mon, 2019-09-09 at 14:31 +0200, Frederic Barrat wrote:
>> The link of PHB slots must be trained after powering on. This can be
>> done by calling the fundamental reset callback of the slot.
>>
>> We could force a reset for all the slots and have a common path in
>> set_power_state(). But this patch only resets the PHB slot. Some slot
>> implementations do a power cycle during fundamental reset, so calling
>> a reset after powering on would repeat that operation.
>>
>> Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>
>> ---
>>   core/pci-opal.c | 130 ++++++++++++++++++++++++++++++++++++++++--------
>>   1 file changed, 108 insertions(+), 22 deletions(-)
>>
>> diff --git a/core/pci-opal.c b/core/pci-opal.c
>> index 69bd65c6..6c070030 100644
>> --- a/core/pci-opal.c
>> +++ b/core/pci-opal.c
>> @@ -678,11 +678,111 @@ static void remove_slot_devices(struct pci_slot *slot)
>>   	pci_remove_bus(phb, &pd->children);
>>   }
>>   
>> +static void link_training_timer(struct timer *t, void *data,
>> +				uint64_t now __unused)
>> +{
>> +	struct pci_slot *slot = data;
>> +	struct phb *phb = slot->phb;
>> +	uint8_t link;
>> +	int64_t rc = 0;
>> +
>> +	phb_lock(phb);
>> +
>> +	rc = slot->ops.run_sm(slot);
>> +	if (rc < 0)
>> +		goto out;
>> +	if (rc > 0) {
>> +		schedule_timer(t, rc);
>> +		phb_unlock(phb);
>> +		return;
>> +	}
>> +
>> +	if (slot->ops.get_link_state(slot, &link) != OPAL_SUCCESS)
>> +		link = 0;
>> +	if (!link) {
>> +		rc = OPAL_HARDWARE;
>> +		goto out;
>> +	}
>> +
>> +	rescan_slot_devices(slot);
>> +out:
>> +	opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
>> +		       slot->async_token, get_slot_phandle(slot),
>> +		       slot->power_state, rc <= 0 ? rc : OPAL_BUSY);
>> +	phb_unlock(phb);
>> +}
>> +
>> +static void link_up_timer(struct timer *t, void *data, uint64_t now __unused)
>> +{
>> +	struct pci_slot *slot = data;
>> +	struct phb *phb = slot->phb;
>> +	uint8_t link;
>> +	int64_t rc = OPAL_SUCCESS;
>> +
>> +	phb_lock(phb);
>> +	if (slot->ops.get_link_state(slot, &link) != OPAL_SUCCESS)
>> +		link = 0;
>> +	if (!link) {
>> +		if (slot->retries-- == 0) {
>> +			rc = OPAL_BUSY;
>> +			goto out;
>> +		} else {
>> +			schedule_timer(t, msecs_to_tb(10));
>> +			phb_unlock(phb);
>> +			return;
>> +		}
>> +	}
>> +
>> +	rescan_slot_devices(slot);
>> +out:
>> +	opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
>> +		       slot->async_token, get_slot_phandle(slot),
>> +		       slot->power_state, rc <= 0 ? rc : OPAL_BUSY);
>> +	phb_unlock(phb);
>> +}
>> +
> 
>> +static bool training_needed(struct pci_slot *slot)
>> +{
>> +	struct phb *phb = slot->phb;
>> +	struct pci_device *pd = slot->pd;
>> +
>> +	if ((pd && !pd->parent) || /* "real" PHB */
>> +	    (!pd && phb->phb_type == phb_type_npu_v2_opencapi))
>> +		return true;
>> +	return false;
>> +}
> 
> Any reason why training_needed() couldn't just be a link state check?
> That's basicly what's happening here anyway with a one timer tick wait
> added
> 
>   If that wait is important for some reason then that should be
> reflected via a state machine transition rather.


At least in the opencapi case, where the power of the adapter stays on, 
the link is still up at the time. The card is fenced and in reset, but 
the link will drop when we release the reset (due to the implementation 
of the DL on the fpga). So reading the link state would tell everything 
is fine, when we know we'll need to retrain the link.

For the cases were we don't retrain and just wait, the 1 time tick 
doesn't matter, we'll loop in link_up_timer() anyway. I guess I could 
default it to something like the equivalent if 1ms which could save us 
one iteration of the loop?


>> +static void wait_for_link_up_and_rescan(struct pci_slot *slot)
>> +{
>> +	int64_t rc;
>> +
>> +	/*
>> +	 * Links for PHB slots need to be retrained by triggering a
>> +	 * fundamental reset. Other slots also need to be tested for
>> +	 * readiness
>> +	 */
>> +	if (training_needed(slot)) {
>> +		rc = slot->ops.freset(slot);
>> +		if (rc < 0) {
>> +			opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
>> +				       slot->async_token,
>> +				       get_slot_phandle(slot),
>> +				       slot->power_state,
>> +				       rc <= 0 ? rc : OPAL_BUSY);
>> +			return;
>> +		}
>> +		init_timer(&slot->timer, link_training_timer, slot);
>> +	} else {
>> +		init_timer(&slot->timer, link_up_timer, slot);
>> +		rc = 1;
>> +	}
>> +	schedule_timer(&slot->timer, rc);
>> +}
> 
> This doesn't make much sense to me. The only difference between the two
> paths here is that in the !training_needed() case we call link_up_timer
> which polls the link state with ops.get_link_state() rather than
> cranking the state machine. However, this is more or less the scenario
> that the POLL_LINK state is for, so... why not use that?


For the case we need to retrain, we need to crank the state machine as 
we'll likely need more than one call to ops.freset(), which will end in 
ops.poll_link().
For the case where we don't retrain, are you suggesting that setting the 
slot state to PCI_SLOT_STATE_LINK_START_POLL and crank the state machine 
would do? My understanding is that when polling the link, skiboot is bit 
more "active" then just waiting in get_link_state()

   Fred



>>   static void set_power_timer(struct timer *t __unused, void *data,
>>   			    uint64_t now __unused)
>>   {
>>   	struct pci_slot *slot = data;
>> -	uint8_t link;
>>   	struct phb *phb = slot->phb;
>>   
>>   	phb_lock(phb);
>> @@ -699,9 +799,9 @@ static void set_power_timer(struct timer *t __unused, void *data,
>>   
>>   		break;
>>   	case PCI_SLOT_STATE_SPOWER_DONE:
>> +		pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
>>   		if (slot->power_state == OPAL_PCI_SLOT_POWER_OFF) {
>>   			remove_slot_devices(slot);
>> -			pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
>> 			opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
>>   				       slot->async_token, get_slot_phandle(slot),
>>   				       OPAL_PCI_SLOT_POWER_OFF, OPAL_SUCCESS);
>> @@ -709,23 +809,7 @@ static void set_power_timer(struct timer *t __unused, void *data,
>>   		}
>>   
>>   		/* Power on */
>> -		if (slot->ops.get_link_state(slot, &link) != OPAL_SUCCESS)
>> -			link = 0;
>> -		if (link) {
>> -			rescan_slot_devices(slot);
>> -			pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
>> -			opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
>> -				       slot->async_token, get_slot_phandle(slot),
>> -				       OPAL_PCI_SLOT_POWER_ON, OPAL_SUCCESS);
>> -		} else if (slot->retries-- == 0) {
>> -			pci_slot_set_state(slot, PCI_SLOT_STATE_NORMAL);
>> -			opal_queue_msg(OPAL_MSG_ASYNC_COMP, NULL, NULL,
>> -				       slot->async_token, get_slot_phandle(slot),
>> -				       OPAL_PCI_SLOT_POWER_ON, OPAL_BUSY);
>> -		} else {
>> -			schedule_timer(&slot->timer, msecs_to_tb(10));
>> -		}
>> -
>> +		wait_for_link_up_and_rescan(slot);
>>   		break;
>>   	default:
>>   		prlog(PR_ERR, "PCI SLOT %016llx: Unexpected state 0x%08x\n",
>> @@ -808,10 +892,12 @@ static int64_t opal_pci_set_power_state(uint64_t async_token,
>>   		init_timer(&slot->timer, set_power_timer, slot);
>>   		schedule_timer(&slot->timer, msecs_to_tb(10));
>>   	} else if (rc == OPAL_SUCCESS) {
>> -		if (*state == OPAL_PCI_SLOT_POWER_OFF)
>> +		if (*state == OPAL_PCI_SLOT_POWER_OFF) {
>>   			remove_slot_devices(slot);
>> -		else
>> -			rescan_slot_devices(slot);
>> +		} else {
>> +			wait_for_link_up_and_rescan(slot);
>> +			rc = OPAL_ASYNC_COMPLETION;
>> +		}
>>   	}
>>   
>>   	phb_unlock(phb);
> 



More information about the Skiboot mailing list